From 525213d2f5da1eaf4b922b6b792cb52b2c613368 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 24 Feb 2024 12:28:55 +0100 Subject: [PATCH 01/18] server: init functional tests (#5566) * server: tests: init scenarios - health and slots endpoints - completion endpoint - OAI compatible chat completion requests w/ and without streaming - completion multi users scenario - multi users scenario on OAI compatible endpoint with streaming - multi users with total number of tokens to predict exceeds the KV Cache size - server wrong usage scenario, like in Infinite loop of "context shift" #3969 - slots shifting - continuous batching - embeddings endpoint - multi users embedding endpoint: Segmentation fault #5655 - OpenAI-compatible embeddings API - tokenize endpoint - CORS and api key scenario * server: CI GitHub workflow --------- Co-authored-by: Georgi Gerganov --- .github/ISSUE_TEMPLATE/bug.md | 2 + .github/workflows/server.yml | 127 ++++ examples/server/README.md | 6 + examples/server/server.cpp | 36 +- examples/server/tests/README.md | 46 ++ examples/server/tests/features/environment.py | 67 ++ examples/server/tests/features/issues.feature | 36 + .../server/tests/features/parallel.feature | 77 ++ .../server/tests/features/security.feature | 50 ++ examples/server/tests/features/server.feature | 69 ++ examples/server/tests/features/steps/steps.py | 709 ++++++++++++++++++ .../tests/features/wrong_usages.feature | 21 + examples/server/tests/requirements.txt | 3 + examples/server/tests/tests.sh | 12 + 14 files changed, 1243 insertions(+), 18 deletions(-) create mode 100644 .github/workflows/server.yml create mode 100644 examples/server/tests/README.md create mode 100644 examples/server/tests/features/environment.py create mode 100644 examples/server/tests/features/issues.feature create mode 100644 examples/server/tests/features/parallel.feature create mode 100644 examples/server/tests/features/security.feature create mode 100644 examples/server/tests/features/server.feature create mode 100644 examples/server/tests/features/steps/steps.py create mode 100644 examples/server/tests/features/wrong_usages.feature create mode 100644 examples/server/tests/requirements.txt create mode 100755 examples/server/tests/tests.sh diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md index ce69e6395daae..49812832ca542 100644 --- a/.github/ISSUE_TEMPLATE/bug.md +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -7,3 +7,5 @@ assignees: '' --- Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug. + +If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests). diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml new file mode 100644 index 0000000000000..ed27dc528fb61 --- /dev/null +++ b/.github/workflows/server.yml @@ -0,0 +1,127 @@ +# Server build and tests +name: Server + +on: + workflow_dispatch: # allows manual triggering + push: + branches: + - master + - test/server-add-ci-test # FIXME remove + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + pull_request: + types: [opened, synchronize, reopened] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + +jobs: + server: + runs-on: ubuntu-latest + + strategy: + matrix: + build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan] + sanitizer: [ADDRESS, THREAD, UNDEFINED] + build_type: [Debug, Release] + include: + - build: 'noavx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF' + image: ubuntu:latest + - build: 'avx2' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON' + image: ubuntu:latest + - build: 'avx' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF' + image: ubuntu:latest + - build: 'avx512' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON' + image: ubuntu:latest + experimental: true + - build: 'cublas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON' + image: nvidia/cuda:12.3.1-devel-ubuntu22.04 + arch_not_available: true # require nvidia docker engine + - build: 'clblast' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'openblas' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS' + image: ubuntu:latest + - build: 'kompute' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' + image: ubuntu:latest + arch_not_available: true + - build: 'vulkan' + defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON' + image: ubuntu:latest + arch_not_available: true + + container: + image: ${{ matrix.image }} + ports: + - 8888 + options: --cpus 4 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + apt-get update + apt-get -y install \ + build-essential \ + pkg-config \ + git \ + cmake \ + python3-pip \ + wget \ + psmisc + + - name: Download CLBlast + id: get_clblast + if: ${{ matrix.build == 'clblast' }} + run: | + apt install -y libclblast-dev + + - name: Download OpenBLAS + id: get_openblas + if: ${{ matrix.build == 'openblas' }} + run: | + apt-get -y install libopenblas-dev + + - name: Install Vulkan SDK + id: get_vulkan + if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} + run: | + wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list + apt-get update + apt-get -y install vulkan-sdk + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }} + cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server + + - name: Tests dependencies + id: test_dependencies + run: | + pip install -r examples/server/tests/requirements.txt + + - name: Download models + id: download_models + run: | + cd examples/server/tests + ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf + + - name: Tests + id: server_integration_test + continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }} + run: | + cd examples/server/tests + PORT=8888 ./tests.sh diff --git a/examples/server/README.md b/examples/server/README.md index 4b6cd8326efa8..0c43ac4c97cba 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -98,6 +98,12 @@ curl --request POST \ --data '{"prompt": "Building a website can be done in 10 simple steps:","n_predict": 128}' ``` +## Advanced testing + +We implemented a [server test framework](./tests/README.md) using human-readable scenario. + +*Before submitting an issue, please try to reproduce it with this format.* + ## Node JS Test You need to have [Node.js](https://nodejs.org/en) installed. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 524d0ada33ab0..9fb436c2a18ec 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1410,11 +1410,6 @@ struct llama_server_context int n_processing_slots = 0; for (llama_client_slot &slot: slots) { - if (slot.available()) { - n_idle_slots++; - } else { - n_processing_slots++; - } json slot_data = get_formated_generation(slot); slot_data["id"] = slot.id; slot_data["task_id"] = slot.task_id; @@ -1429,6 +1424,11 @@ struct llama_server_context {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; + if (slot_data["state"] == IDLE) { + n_idle_slots++; + } else { + n_processing_slots++; + } slots_data.push_back(slot_data); } LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); @@ -2748,19 +2748,6 @@ int main(int argc, char **argv) log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; } - LOG_INFO("HTTP server listening", log_data); - // run the HTTP server in a thread - see comment below - std::thread t([&]() - { - if (!svr.listen_after_bind()) - { - state.store(SERVER_STATE_ERROR); - return 1; - } - - return 0; - }); - // load the model if (!llama.load_model(params)) { @@ -3228,6 +3215,19 @@ int main(int argc, char **argv) }*/ //); + LOG_INFO("HTTP server listening", log_data); + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + state.store(SERVER_STATE_ERROR); + return 1; + } + + return 0; + }); + llama.queue_tasks.on_new_task(std::bind( &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md new file mode 100644 index 0000000000000..e44c5c286601f --- /dev/null +++ b/examples/server/tests/README.md @@ -0,0 +1,46 @@ +# Server tests + +Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/): + * [issues.feature](./features/issues.feature) Pending issues scenario + * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests + * [security.feature](./features/security.feature) Security, CORS and API Key + * [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc... + +Tests target GitHub workflows job runners with 4 vCPU. + +Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client. + +Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`. + +### Install dependencies +`pip install -r requirements.txt` + +### Run tests +1. Build the server +```shell +cd ../../.. +mkdir build +cd build +cmake ../ +cmake --build . --target server +``` +2. download required models: + 1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf` +3. Start the test: `./tests.sh` + +It's possible to override some scenario steps values with environment variables: + - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` + - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` + - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` + +### Run @bug, @wip or @wrong_usage annotated scenario + +Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope. +- `@bug` annotation aims to link a scenario with a GitHub issue. +- `@wrong_usage` are meant to show user issue that are actually an expected behavior +- `@wip` to focus on a scenario working in progress + +To run a scenario annotated with `@bug`, start: +`DEBUG=ON ./tests.sh --no-skipped --tags bug` + +After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py new file mode 100644 index 0000000000000..13cc841017f62 --- /dev/null +++ b/examples/server/tests/features/environment.py @@ -0,0 +1,67 @@ +import os +import socket +import subprocess +import time +from contextlib import closing +from signal import SIGKILL + + +def before_scenario(context, scenario): + print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") + port = 8080 + if 'PORT' in os.environ: + port = int(os.environ['PORT']) + if is_server_listening("localhost", port): + assert False, "Server already started" + + +def after_scenario(context, scenario): + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + if os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") + + if not pid_exists(context.server_process.pid): + assert False, f"Server not running pid={context.server_process.pid} ..." + + print(f"stopping server pid={context.server_process.pid} ...") + context.server_process.kill() + # Wait few for socket to free up + time.sleep(0.05) + + attempts = 0 + while is_server_listening(context.server_fqdn, context.server_port): + print(f"stopping server pid={context.server_process.pid} ...") + os.kill(context.server_process.pid, SIGKILL) + time.sleep(0.1) + attempts += 1 + if attempts > 5: + print(f"Server dangling exits, killing all {context.server_path} ...") + process = subprocess.run(['killall', '-9', context.server_path], + stderr=subprocess.PIPE, + universal_newlines=True) + print(process) + + +def is_server_listening(server_fqdn, server_port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((server_fqdn, server_port)) + return result == 0 + + +def pid_exists(pid): + """Check whether pid exists in the current process table.""" + import errno + if pid < 0: + return False + try: + os.kill(pid, 0) + except OSError as e: + return e.errno == errno.EPERM + else: + return True diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature new file mode 100644 index 0000000000000..542006d9a8df2 --- /dev/null +++ b/examples/server/tests/features/issues.feature @@ -0,0 +1,36 @@ +# List of ongoing issues +@bug +Feature: Issues + # Issue #5655 + Scenario: Multi users embeddings + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + And 64 KV cache size + And 2 slots + And continuous batching + And embeddings extraction + Then the server is starting + Then the server is healthy + + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature new file mode 100644 index 0000000000000..802d624ffc9a3 --- /dev/null +++ b/examples/server/tests/features/parallel.feature @@ -0,0 +1,77 @@ +@llama.cpp +Feature: Parallel + + Background: Server startup + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + And 64 KV cache size + And 2 slots + And continuous batching + Then the server is starting + Then the server is healthy + + Scenario Outline: Multi users completion + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + And all slots are idle + Then all prompts are predicted with tokens + Examples: + | n_predict | + | 128 | + + Scenario Outline: Multi users OAI completions compatibility + Given a system prompt You are a writer. + And a model tinyllama-2 + Given a prompt: + """ + Write a very long book. + """ + And a prompt: + """ + Write another a poem. + """ + And max tokens to predict + And streaming is + Given concurrent OAI completions requests + Then the server is busy + Then the server is idle + Then all prompts are predicted with tokens + Examples: + | streaming | n_predict | + | disabled | 128 | + | enabled | 64 | + + Scenario: Multi users with total number of tokens to predict exceeds the KV Cache size #3969 + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + And 128 max tokens to predict + Given concurrent completion requests + Then the server is busy + Then the server is idle + Then all prompts are predicted diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature new file mode 100644 index 0000000000000..db06d39775c05 --- /dev/null +++ b/examples/server/tests/features/security.feature @@ -0,0 +1,50 @@ +@llama.cpp +Feature: Security + + Background: Server startup with an api key defined + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a server api key llama.cpp + Then the server is starting + Then the server is healthy + + Scenario Outline: Completion with some user api key + Given a prompt test + And a user api key + And 4 max tokens to predict + And a completion request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackeme | raised | + | | raised | + + Scenario Outline: OAI Compatibility + Given a system prompt test + And a user prompt test + And a model test + And 2 max tokens to predict + And streaming is disabled + And a user api key + Given an OAI compatible chat completions request with api error + + Examples: Prompts + | api_key | api_error | + | llama.cpp | no | + | llama.cpp | no | + | hackme | raised | + + + Scenario Outline: CORS Options + When an OPTIONS request is sent from + Then CORS header is set to + + Examples: Headers + | origin | cors_header | cors_header_value | + | localhost | Access-Control-Allow-Origin | localhost | + | web.mydomain.fr | Access-Control-Allow-Origin | web.mydomain.fr | + | origin | Access-Control-Allow-Credentials | true | + | web.mydomain.fr | Access-Control-Allow-Methods | POST | + | web.mydomain.fr | Access-Control-Allow-Headers | * | diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature new file mode 100644 index 0000000000000..fedcfe5aef1b3 --- /dev/null +++ b/examples/server/tests/features/server.feature @@ -0,0 +1,69 @@ +@llama.cpp +Feature: llama.cpp server + + Background: Server startup + Given a server listening on localhost:8080 + And a model file stories260K.gguf + And a model alias tinyllama-2 + And 42 as server seed + # KV Cache corresponds to the total amount of tokens + # that can be stored across all independent sequences: #4130 + # see --ctx-size and #5568 + And 32 KV cache size + And 1 slots + And embeddings extraction + And 32 server max tokens to predict + Then the server is starting + Then the server is healthy + + Scenario: Health + Then the server is ready + And all slots are idle + + Scenario Outline: Completion + Given a prompt + And max tokens to predict + And a completion request with no api error + Then tokens are predicted matching + + Examples: Prompts + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | read | 8 | + | Write a joke about AI | 64 | (parkfriendsscared)+ | 32 | + + Scenario Outline: OAI Compatibility + Given a model + And a system prompt + And a user prompt + And max tokens to predict + And streaming is + Given an OAI compatible chat completions request with no api error + Then tokens are predicted matching + + Examples: Prompts + | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | + | llama-2 | Book | What is the best book | 8 | (Momwhat)+ | 8 | disabled | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thankshappybird)+ | 32 | enabled | + + Scenario: Embedding + When embeddings are computed for: + """ + What is the capital of Bulgaria ? + """ + Then embeddings are generated + + Scenario: OAI Embeddings compatibility + Given a model tinyllama-2 + When an OAI compatible embeddings computation request for: + """ + What is the capital of Spain ? + """ + Then embeddings are generated + + + Scenario: Tokenize / Detokenize + When tokenizing: + """ + What is the capital of France ? + """ + Then tokens can be detokenize diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py new file mode 100644 index 0000000000000..50f2b641e764e --- /dev/null +++ b/examples/server/tests/features/steps/steps.py @@ -0,0 +1,709 @@ +import asyncio +import json +import os +import re +import socket +import subprocess +import time +from contextlib import closing +from re import RegexFlag + +import aiohttp +import openai +from behave import step +from behave.api.async_step import async_run_until_complete + + +@step(u"a server listening on {server_fqdn}:{server_port}") +def step_server_config(context, server_fqdn, server_port): + context.server_fqdn = server_fqdn + context.server_port = int(server_port) + if 'PORT' in os.environ: + context.server_port = int(os.environ['PORT']) + print(f"$PORT set, overriding server port with to {context.server_port}") + + context.base_url = f'http://{context.server_fqdn}:{context.server_port}' + + context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' + context.model_alias = None + context.n_ctx = None + context.n_predict = None + context.n_server_predict = None + context.n_slots = None + context.server_api_key = None + context.server_continuous_batching = False + context.server_embeddings = False + context.server_seed = None + context.user_api_key = None + + context.tasks_result = [] + context.concurrent_tasks = [] + context.prompts = [] + + +@step(u'a model file {model_file}') +def step_model_file(context, model_file): + context.model_file = model_file + + +@step(u'a model alias {model_alias}') +def step_model_alias(context, model_alias): + context.model_alias = model_alias + + +@step(u'{seed} as server seed') +def step_seed(context, seed): + context.server_seed = int(seed) + + +@step(u'{n_ctx} KV cache size') +def step_n_ctx(context, n_ctx): + context.n_ctx = int(n_ctx) + + +@step(u'{n_slots} slots') +def step_n_slots(context, n_slots): + context.n_slots = int(n_slots) + + +@step(u'{n_predict} server max tokens to predict') +def step_server_n_predict(context, n_predict): + context.n_server_predict = int(n_predict) + + +@step(u'continuous batching') +def step_server_continuous_batching(context): + context.server_continuous_batching = True + + +@step(u'embeddings extraction') +def step_server_embeddings(context): + context.server_embeddings = True + + +@step(u"the server is starting") +def step_start_server(context): + start_server_background(context) + attempts = 0 + while True: + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((context.server_fqdn, context.server_port)) + if result == 0: + print("\x1b[33;46mserver started!\x1b[0m") + return + attempts += 1 + if attempts > 20: + assert False, "server not started" + print(f"waiting for server to start, connect error code = {result}...") + time.sleep(0.1) + + +@step(u"the server is {expecting_status}") +@async_run_until_complete +async def step_wait_for_the_server_to_be_started(context, expecting_status): + match expecting_status: + case 'healthy': + await wait_for_health_status(context, context.base_url, 200, 'ok') + + case 'ready' | 'idle': + await wait_for_health_status(context, context.base_url, 200, 'ok', + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=context.n_slots, + slots_processing=0, + expected_slots=[{'id': slot_id, 'state': 0} + for slot_id in range(context.n_slots)]) + case 'busy': + await wait_for_health_status(context, context.base_url, 503, + 'no slot available', + params={'fail_on_no_slot': 0, 'include_slots': 0}, + slots_idle=0, + slots_processing=context.n_slots, + expected_slots=[{'id': slot_id, 'state': 1} + for slot_id in range(context.n_slots)]) + case _: + assert False, "unknown status" + + +@step(u'all slots are {expected_slot_status_string}') +@async_run_until_complete +async def step_all_slots_status(context, expected_slot_status_string): + match expected_slot_status_string: + case 'idle': + expected_slot_status = 0 + case 'busy': + expected_slot_status = 1 + case _: + assert False, "unknown status" + + expected_slots = [{'id': slot_id, 'state': expected_slot_status} + for slot_id in range(context.n_slots)] + await request_slots_status(context, expected_slots) + + +@step(u'a completion request with {api_error} api error') +@async_run_until_complete +async def step_request_completion(context, api_error): + expect_api_error = api_error == 'raised' + completion = await request_completion(context.prompts.pop(), + context.base_url, + debug=context.debug, + n_predict=context.n_predict, + server_seed=context.server_seed, + expect_api_error=expect_api_error, + user_api_key=context.user_api_key) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + +@step(u'{predicted_n} tokens are predicted matching {re_content}') +def step_n_tokens_predicted_with_content(context, predicted_n, re_content): + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content) + + +@step(u'{predicted_n} tokens are predicted') +def step_n_tokens_predicted(context, predicted_n): + assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n)) + + +@step(u'a user prompt {user_prompt}') +def step_user_prompt(context, user_prompt): + context.prompts.append(user_prompt) + + +@step(u'a system prompt {system_prompt}') +def step_system_prompt(context, system_prompt): + context.system_prompt = system_prompt + + +@step(u'a model {model}') +def step_model(context, model): + context.model = model + + +@step(u'{max_tokens} max tokens to predict') +def step_max_tokens(context, max_tokens): + context.n_predict = int(max_tokens) + + +@step(u'streaming is {enable_streaming}') +def step_streaming(context, enable_streaming): + context.enable_streaming = enable_streaming == 'enabled' + + +@step(u'a user api key {user_api_key}') +def step_user_api_key(context, user_api_key): + context.user_api_key = user_api_key + + +@step(u'no user api key') +def step_no_user_api_key(context): + context.user_api_key = None + + +@step(u'a user api key ') +def step_no_user_api_key_space(context): + context.user_api_key = None + + +@step(u'a server api key {server_api_key}') +def step_server_api_key(context, server_api_key): + context.server_api_key = server_api_key + + +@step(u'an OAI compatible chat completions request with {api_error} api error') +@async_run_until_complete +async def step_oai_chat_completions(context, api_error): + if context.debug: + print(f"Submitting OAI compatible completions request...") + expect_api_error = api_error == 'raised' + completion = await oai_chat_completions(context.prompts.pop(), + context.system_prompt, + context.base_url, + False, + model=context.model if hasattr(context, 'model') else None, + + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None, + + expect_api_error=expect_api_error) + context.tasks_result.append(completion) + if context.debug: + print(f"Completion response: {completion}") + if expect_api_error: + assert completion == 401, f"completion must be an 401 status code: {completion}" + + if context.debug: + print(f"Completion response: {completion}") + + +@step(u'a prompt') +def step_a_prompt(context): + context.prompts.append(context.text) + + +@step(u'a prompt {prompt}') +def step_a_prompt_prompt(context, prompt): + context.prompts.append(prompt) + + +@step(u'concurrent completion requests') +@async_run_until_complete() +async def step_concurrent_completion_requests(context): + await concurrent_completion_requests(context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key if hasattr(context, + 'user_api_key') else None) + + +@step(u'concurrent OAI completions requests') +@async_run_until_complete +async def step_oai_chat_completions(context): + await concurrent_completion_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) + + +@step(u'all prompts are predicted') +@async_run_until_complete +async def step_all_prompts_are_predicted(context): + await all_prompts_are_predicted(context) + + +@step(u'all prompts are predicted with {n_predict} tokens') +@async_run_until_complete +async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict): + expected_predicted_n = int(n_predict) + await all_prompts_are_predicted(context, expected_predicted_n) + + +async def all_prompts_are_predicted(context, expected_predicted_n=None): + n_completions = await gather_tasks_results(context) + assert n_completions > 0 + for i in range(n_completions): + assert_n_tokens_predicted(context.tasks_result.pop(), expected_predicted_n=expected_predicted_n) + assert len(context.concurrent_tasks) == 0, f"{len(context.concurrent_tasks)} pending requests" + + +@step(u'embeddings are computed for') +@async_run_until_complete +async def step_compute_embedding(context): + content = context.text + base_url = context.base_url + context.embeddings = await request_embedding(content, base_url) + + +@step(u'embeddings are generated') +def step_assert_embeddings(context): + assert_embeddings(context.embeddings) + + +@step(u'an OAI compatible embeddings computation request for') +def step_oai_compute_embedding(context): + openai.api_key = 'nope' # openai client always expects an api_keu + if context.user_api_key is not None: + openai.api_key = context.user_api_key + openai.api_base = f'{context.base_url}/v1' + embeddings = openai.Embedding.create( + model=context.model, + input=context.text, + ) + context.embeddings = embeddings + + +@step(u'concurrent embedding requests') +@async_run_until_complete() +async def step_concurrent_embedding_requests(context): + await concurrent_completion_requests(context, + request_embedding, + # prompt is inserted automatically + context.base_url) + + +@step(u'all embeddings are generated') +@async_run_until_complete() +async def all_embeddings_are_generated(context): + n_embedding_requests = await gather_tasks_results(context) + assert n_embedding_requests > 0 + for i in range(n_embedding_requests): + assert_embeddings(context.tasks_result.pop()) + + +@step(u'tokenizing') +@async_run_until_complete +async def step_tokenize(context): + context.tokenized_text = context.text + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/tokenize', + json={ + "content": context.tokenized_text, + }) as response: + assert response.status == 200 + tokenize_json = await response.json() + context.tokens = tokenize_json['tokens'] + + +@step(u'tokens can be detokenize') +@async_run_until_complete +async def step_detokenize(context): + assert len(context.tokens) > 0 + async with aiohttp.ClientSession() as session: + async with session.post(f'{context.base_url}/detokenize', + json={ + "tokens": context.tokens, + }) as response: + assert response.status == 200 + detokenize_json = await response.json() + # SPM tokenizer adds a whitespace prefix: https://github.com/google/sentencepiece/issues/15 + assert context.tokenized_text == detokenize_json['content'].strip() + + +@step(u'an OPTIONS request is sent from {origin}') +@async_run_until_complete +async def step_options_request(context, origin): + async with aiohttp.ClientSession() as session: + async with session.options(f'{context.base_url}/v1/chat/completions', + headers={"Origin": origin}) as response: + assert response.status == 200 + context.options_response = response + + +@step(u'CORS header {cors_header} is set to {cors_header_value}') +def step_check_options_header_value(context, cors_header, cors_header_value): + assert context.options_response.headers[cors_header] == cors_header_value + + +async def concurrent_completion_requests(context, f_completion, *args, **kwargs): + n_prompts = len(context.prompts) + if context.debug: + print(f"starting {n_prompts} concurrent completion requests...") + assert n_prompts > 0 + for prompt_no in range(n_prompts): + shifted_args = [context.prompts.pop(), *args] + context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs))) + await asyncio.sleep(0.1) + + +async def request_completion(prompt, + base_url, + debug=False, + n_predict=None, + server_seed=None, + expect_api_error=None, + user_api_key=None): + if debug: + print(f"Sending completion request: {prompt}") + origin = "my.super.domain" + headers = { + 'Origin': origin + } + if user_api_key is not None: + if debug: + print(f"Set user_api_key: {user_api_key}") + headers['Authorization'] = f'Bearer {user_api_key}' + + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/completion', + json={ + "prompt": prompt, + "n_predict": int(n_predict) if n_predict is not None else -1, + "seed": server_seed if server_seed is not None else 42 + }, + headers=headers) as response: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + return await response.json() + else: + return response.status + + +async def oai_chat_completions(user_prompt, + system_prompt, + base_url, + async_client, + debug=False, + model=None, + n_predict=None, + enable_streaming=None, + server_seed=None, + user_api_key=None, + expect_api_error=None): + if debug: + print(f"Sending OAI Chat completions request: {user_prompt}") + # openai client always expects an api key + user_api_key = user_api_key if user_api_key is not None else 'nope' + seed = server_seed if server_seed is not None else 42 + enable_streaming = enable_streaming if enable_streaming is not None else False + payload = { + "messages": [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + } + ], + "model": model, + "max_tokens": n_predict, + "stream": enable_streaming, + "seed": seed + } + completion_response = { + 'content': '', + 'timings': { + 'predicted_n': 0 + } + } + if async_client: + origin = 'llama.cpp' + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/chat/completions', + json=payload, + headers=headers) as response: + if enable_streaming: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "text/event-stream" + event_received = True + while event_received: + event_received = False + async for line_in_bytes in response.content: + line = line_in_bytes.decode('utf8') + line = line.rstrip('\n').rstrip('\r') + if line == '': + continue + event_data = line.split(': ', 1) + assert event_data[0] == 'data', f'Bad event code received: ```{event_data}```' + chunk_raw = event_data[1] + + chunk = json.loads(chunk_raw) + assert len(chunk['choices']) == 1, f"no choices provided, line ```{line}```" + delta = chunk['choices'][0]['delta'] + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + else: + if expect_api_error is None or not expect_api_error: + assert response.status == 200 + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + chat_completion_raw = await response.json() + completion_response = { + 'content': chat_completion_raw['choices'][0]['message'], + 'timings': { + 'predicted_n': chat_completion_raw['usage']['completion_tokens'] + } + } + else: + return response.status + else: + try: + openai.api_key = user_api_key + openai.api_base = f'{base_url}/v1/chat' + chat_completion = openai.Completion.create( + messages=payload['messages'], + model=model, + max_tokens=n_predict, + stream=enable_streaming, + seed=seed + ) + except openai.error.APIError as e: + if expect_api_error is not None and expect_api_error: + return 401 + else: + assert False, f'error raised: {e}' + + if enable_streaming: + for chunk in chat_completion: + assert len(chunk.choices) == 1 + delta = chunk.choices[0].delta + if 'content' in delta: + completion_response['content'] += delta['content'] + completion_response['timings']['predicted_n'] += 1 + else: + assert len(chat_completion.choices) == 1 + completion_response = { + 'content': chat_completion.choices[0].message.content, + 'timings': { + 'predicted_n': chat_completion.usage.completion_tokens + } + } + if debug: + print("OAI response formatted to llama.cpp:", completion_response) + return completion_response + + +async def request_embedding(content, base_url): + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/embedding', + json={ + "content": content, + }) as response: + assert response.status == 200 + response_json = await response.json() + return response_json['embedding'] + + +def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): + content = completion_response['content'] + n_predicted = completion_response['timings']['predicted_n'] + assert len(content) > 0, "no token predicted" + if expected_predicted_n is not None: + assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' + f' {n_predicted} <> {expected_predicted_n}') + if re_content is not None: + re_content = '^.*' + re_content.replace('', '|') + '.*$' + assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), ( + f'invalid tokens predicted:' + f' ```\n{content}\n``` do not match /{re_content}/') + + +async def gather_tasks_results(context): + n_tasks = len(context.concurrent_tasks) + if context.debug: + print(f"Waiting for all {n_tasks} tasks results...") + for task_no in range(n_tasks): + context.tasks_result.append(await context.concurrent_tasks.pop()) + n_completions = len(context.tasks_result) + return n_completions + + +async def wait_for_health_status(context, + base_url, + expected_http_status_code, + expected_health_status, + params=None, + slots_idle=None, + slots_processing=None, + expected_slots=None): + if context.debug: + print(f"Starting checking for health for expected_health_status={expected_health_status}") + timeout = 3 # seconds + interval = 0.5 + counter = 0 + async with aiohttp.ClientSession() as session: + while True: + async with await session.get(f'{base_url}/health', params=params) as health_response: + status_code = health_response.status + health = await health_response.json() + if context.debug: + print(f"HEALTH - response for expected health status='{expected_health_status}' on " + f"'{base_url}/health'?{params} is {health}") + if (status_code == expected_http_status_code + and health['status'] == expected_health_status + and (slots_idle is None or health['slots_idle'] == slots_idle) + and (slots_processing is None or health['slots_processing'] == slots_processing)): + if expected_slots is not None: + assert_slots_status(health['slots'], expected_slots) + return + if (status_code == expected_http_status_code + and health['status'] == expected_health_status + and (slots_idle is None or health['slots_idle'] == slots_idle) + and (slots_processing is None or health['slots_processing'] == slots_processing)): + if expected_slots is not None: + assert_slots_status(health['slots'], expected_slots) + return + await asyncio.sleep(interval) + + counter += interval + if counter >= timeout: + # Sometimes health requests are triggered after completions are predicted + if expected_http_status_code == 503: + if len(context.tasks_result) == 0: + print("\x1b[5;37;43mWARNING: forcing concurrent tasks," + " busy health check missed, probably too fast inference\x1b[0m") + n_completions = await gather_tasks_results(context) + if n_completions > 0: + return + + assert False, 'timeout exceeded' + + +def assert_embeddings(embeddings): + assert len(embeddings) > 0 + embeddings_computed = False + for emb in embeddings: + if emb != 0: + embeddings_computed = True + assert embeddings_computed, f"Embeddings: {embeddings}" + + +async def request_slots_status(context, expected_slots): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/slots') as slots_response: + assert slots_response.status == 200 + slots = await slots_response.json() + assert_slots_status(slots, expected_slots) + + +def assert_slots_status(slots, expected_slots): + assert len(slots) == len(expected_slots) + for slot_id, (expected, slot) in enumerate(zip(expected_slots, slots)): + for key in expected: + assert expected[key] == slot[key], (f"invalid slot {slot_id}" + f" expected[{key}] != slot[{key}]" + f" = {expected[key]} != {slot[key]}") + + +def start_server_background(context): + context.server_path = '../../../build/bin/server' + if 'LLAMA_SERVER_BIN_PATH' in os.environ: + context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_args = [ + '--host', context.server_fqdn, + '--port', context.server_port, + '--model', context.model_file + ] + if context.server_continuous_batching: + server_args.append('--cont-batching') + if context.server_embeddings: + server_args.append('--embedding') + if context.model_alias is not None: + server_args.extend(['--alias', context.model_alias]) + if context.n_ctx is not None: + server_args.extend(['--ctx-size', context.n_ctx]) + if context.n_slots is not None: + server_args.extend(['--parallel', context.n_slots]) + if context.n_server_predict is not None: + server_args.extend(['--n-predict', context.n_server_predict]) + if context.server_api_key is not None: + server_args.extend(['--api-key', context.server_api_key]) + if context.debug: + server_args.append('--verbose') + print(f"starting server with: {context.server_path}", *server_args) + context.server_process = subprocess.Popen( + [str(arg) for arg in [context.server_path, *server_args]], + close_fds=True) + print(f"server pid={context.server_process.pid}") diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature new file mode 100644 index 0000000000000..e228b2371ccce --- /dev/null +++ b/examples/server/tests/features/wrong_usages.feature @@ -0,0 +1,21 @@ +# run with ./test.sh --tags wrong_usage +@wrong_usage +Feature: Wrong usage of llama.cpp server + + #3969 The user must always set --n-predict option + # to cap the number of tokens any completion request can generate + # or pass n_predict/max_tokens in the request. + Scenario: Infinite loop + Given a server listening on localhost:8080 + And a model file stories260K.gguf + # Uncomment below to fix the issue + #And 64 server max tokens to predict + Then the server is starting + Given a prompt: + """ + Go to: infinite loop + """ + # Uncomment below to fix the issue + #And 128 max tokens to predict + Given concurrent completion requests + Then all prompts are predicted diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt new file mode 100644 index 0000000000000..3e51b12dc8207 --- /dev/null +++ b/examples/server/tests/requirements.txt @@ -0,0 +1,3 @@ +aiohttp~=3.9.3 +behave~=1.2.6 +openai~=0.25.0 diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh new file mode 100755 index 0000000000000..17a4e6fc64307 --- /dev/null +++ b/examples/server/tests/tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +set -eu + +if [ $# -lt 1 ] +then + # Start @llama.cpp scenario + behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp +else + behave "$@" +fi + From 4c4cb30736582cacb1a164a9d4bc8e17b1014be7 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sat, 24 Feb 2024 16:23:52 +0200 Subject: [PATCH 02/18] IQ3_S: a much better alternative to Q3_K (#5676) * iq4_nl: squash commits for easier rebase * Basics (quantize, dequantize) * CUDA dequantize and dot product * Slightly faster CUDA dot product (120 t/s) * Switch to 6-bit scales * Scalar dot product * AVX2 dot product * ARM_NEON dot product * Works on metal, but still slow * Slightly better Metal dot product * Another small Metal improvement * Metal dot product is getting there * Faster CUDA dot product * Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided * Report the actual bpw * Add _xs mix that is 4.05 bpw for non-MoE models * Remove IQ4_XS for now, slightly adjust kvalues_iq4nl * AVX2 dot product uses Q8_0 instead of Q8_K * Add to test-backend-ops * Minor fix * Also use use Q5_K for attn_output in MoE models * Fixes after merging latest master * Switching to blocks of 32 * AVX2 for blocks of 32 * Scaler dot product for blocks of 32 * ARM_NEON dot product for blocks of 32 * Metal kernels for blocks of 32 * Slightly faster Metal kernels * Resurrecting iq3_xs After all the experimentation, nothing was better than this. * Minor PPL improvement via a block scale fudge factor * Minor improvement via 3 neighbours * iq3_xs: working scalar and AVX2 dot products * iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s) * iq3_xs: working Metal implementation * Adding IQ3_M - IQ3_XS mix with mostly Q4_K * iiq3_xs: a 3.4375 bpw variant * iq3_xs: make CUDA work for new version * iq3_xs: make scalar and AVX2 work for new version * iq3_s: make ARM_NEON work with new version * iq3_xs: make new version work on metal Performance is very similar to Q3_K_S * iq3_xs: tiny Metal speed improvement * iq3_xs: tiny Metal speed improvement * Fix stupid warning * Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS * iq3_xs: rename to iq3_s * iq3_s: make tests pass * Move Q3_K_XS mix to 3.25 bpw * Attempt to fix failing tests * Another attempt to fix the Windows builds * Attempt to fix ROCm * ROCm again * iq3_s: partial fix for QK_K = 64 * iq3_s: make it work on metal for QK_K = 64 Pleasent surprise: the coding was super-block size independent, so all it took was to delete some QK_K == 256 guards. * Will this fix ROCm? --------- Co-authored-by: Iwan Kawrakow --- examples/quantize/quantize.cpp | 2 + ggml-cuda.cu | 171 ++++++++- ggml-metal.m | 33 +- ggml-metal.metal | 304 +++++++++++++++ ggml-quants.c | 674 +++++++++++++++++++++++++++++---- ggml-quants.h | 20 + ggml.c | 31 ++ ggml.h | 2 + llama.cpp | 50 ++- llama.h | 2 + tests/test-backend-ops.cpp | 2 +- tests/test-quantize-fns.cpp | 4 +- 12 files changed, 1211 insertions(+), 84 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 37520857f99f7..ab7e72aaf8254 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -27,6 +27,8 @@ static const std::vector QUANT_OPTIONS = { { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, + { "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", }, + { "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b0e454e025ec4..21c612cb71b48 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -172,6 +172,7 @@ #endif typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); static __device__ __forceinline__ int __vsubss4(const int a, const int b) { const int8x4_t va = reinterpret_cast(a); const int8x4_t vb = reinterpret_cast(b); @@ -196,6 +197,18 @@ static __device__ __forceinline__ int __vsub4(const int a, const int b) { return __vsubss4(a, b); } +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { #if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) c = __builtin_amdgcn_sdot4(a, b, c, false); @@ -518,6 +531,17 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +#define QR3_XS 8 +#define QI3_XS (QK_K / (4*QR3_XS)) +typedef struct { + half d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[QK_K/64]; +} block_iq3_s; +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 27*(QK_K/64), "wrong iq3_s block size/padding"); + #define QR1_S 8 #define QI1_S (QK_K / (4*QR1_S)) typedef struct { @@ -1700,6 +1724,74 @@ static const __device__ uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +static const __device__ uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + + static const __device__ uint64_t iq1s_grid[512] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, @@ -1973,6 +2065,32 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds } +template +static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; + const uint8_t signs = x[i].signs[4*ib + il]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + assert(false); +#endif + +} + template static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { @@ -4717,6 +4835,41 @@ static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( #endif } +// TODO: don't use lookup table for signs +static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); + sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + q8 += 8; + } + const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; + return d * sumi; +#else + assert(false); + return 0.f; +#endif +#else + assert(false); + return 0.f; +#endif +} + + static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { #if QK_K == 256 @@ -6849,6 +7002,12 @@ static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq3_xxs<<>>(vx, y); } +template +static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq3_s<<>>(vx, y); +} + template static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { const int nb = k / QK_K; @@ -6904,6 +7063,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_iq1_s_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6943,6 +7104,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_iq1_s_cuda; case GGML_TYPE_IQ4_NL: return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -8688,6 +8851,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -8713,6 +8877,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -8818,6 +8983,10 @@ static void ggml_cuda_op_mul_mat_vec_q( mul_mat_vec_q_cuda (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_q_cuda + (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; default: GGML_ASSERT(false); break; @@ -11541,7 +11710,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons } ggml_type a_type = a->type; if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || - a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL) { + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } diff --git a/ggml-metal.m b/ggml-metal.m index 0d4aa43093739..ee584cfa71ce7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -61,6 +61,7 @@ GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, @@ -85,6 +86,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, @@ -105,6 +107,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, @@ -122,6 +125,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, @@ -139,6 +143,7 @@ GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, @@ -452,6 +457,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); @@ -476,6 +482,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); @@ -496,6 +503,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); @@ -513,6 +521,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); @@ -530,6 +539,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); @@ -1347,6 +1357,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); @@ -1483,6 +1494,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32].pipeline; + } break; case GGML_TYPE_IQ1_S: { nth0 = 4; @@ -1537,8 +1554,8 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ3_XXS) { - const int mem_size = 256*4+128; + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1640,6 +1657,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); @@ -1779,6 +1797,12 @@ static bool ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32].pipeline; } break; + case GGML_TYPE_IQ3_S: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32].pipeline; + } break; case GGML_TYPE_IQ1_S: { nth0 = 4; @@ -1849,8 +1873,8 @@ static bool ggml_metal_graph_compute( [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ3_XXS) { - const int mem_size = 256*4+128; + else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) { + const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } @@ -1900,6 +1924,7 @@ static bool ggml_metal_graph_compute( case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break; case GGML_TYPE_IQ2_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break; case GGML_TYPE_IQ3_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS].pipeline; break; + case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; diff --git a/ggml-metal.metal b/ggml-metal.metal index c223a981c246a..b3bf405391d3e 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2525,6 +2525,20 @@ typedef struct { } block_iq3_xxs; // 98 bytes / block for QK_K = 256, so 3.0625 bpw +// 3.4375 bpw +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif +typedef struct { + half d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[IQ3S_N_SCALE]; +} block_iq3_s; + typedef struct { half d; uint8_t qs[QK_K/8]; @@ -3795,6 +3809,73 @@ constexpr constant static uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +constexpr constant static uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + #define NGRID_IQ1S 512 constexpr constant static uint64_t iq1s_grid[NGRID_IQ1S] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, @@ -4361,6 +4442,136 @@ kernel void kernel_mul_mv_iq3_xxs_f32( kernel_mul_mv_iq3_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq3_s_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq3_s * x = (device const block_iq3_s *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint32_t * values = (threadgroup uint32_t *)shared_values; + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq3_s * xr = x + ibl; + device const uint8_t * qs = xr->qs + 8 * ib; + device const uint8_t * qh = xr->qh + ib; + device const uint8_t * sc = xr->scales + (ib/2); + device const uint8_t * signs = xr->signs + 4 * ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf)); + + float2 sum = {0}; + for (int l = 0; l < 4; ++l) { + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); + sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); + } + } + sumf[row] += d * (sum[0] + sum[1]); + + dh += nb*sizeof(block_iq3_s)/2; + qs += nb*sizeof(block_iq3_s); + qh += nb*sizeof(block_iq3_s); + sc += nb*sizeof(block_iq3_s); + signs += nb*sizeof(block_iq3_s); + } + + y4 += 32 * 32; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; + } + } +} + +[[host_name("kernel_mul_mv_iq3_s_f32")]] +kernel void kernel_mul_mv_iq3_s_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq3_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + void kernel_mul_mv_iq1_s_f32_impl( device const void * src0, device const float * src1, @@ -4952,6 +5163,31 @@ void dequantize_iq3_xxs(device const block_iq3_xxs * xb, short il, thread type4x } } +template +void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint8_t * qs = xb->qs + 8*ib32; + device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; + const uint8_t qh = xb->qh[ib32] >> 4*il; + const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f; + constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); + reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); + } + grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256))); + for (int i = 0; i < 4; ++i) { + reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); + reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); + } +} + template void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & reg) { // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 @@ -5525,6 +5761,7 @@ template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; @@ -5566,6 +5803,7 @@ template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -5619,6 +5857,7 @@ template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -6589,6 +6828,71 @@ kernel void kernel_mul_mv_id_iq3_xxs_f32( sgitg); } +[[host_name("kernel_mul_mv_id_iq3_s_f32")]] +kernel void kernel_mul_mv_id_iq3_s_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq3_s_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} + [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel void kernel_mul_mv_id_iq1_s_f32( device const char * ids, diff --git a/ggml-quants.c b/ggml-quants.c index b15977f53e2f3..5c5f2ce1b9b87 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3505,6 +3505,73 @@ static const uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; +static const uint32_t iq3xs_grid[512] = { + 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, + 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, + 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, + 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, + 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, + 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, + 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, + 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, + 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, + 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, + 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, + 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, + 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, + 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, + 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, + 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, + 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, + 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, + 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, + 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, + 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, + 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, + 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, + 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, + 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, + 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, + 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, + 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, + 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, + 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, + 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, + 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, + 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, + 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, + 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, + 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, + 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, + 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, + 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, + 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, + 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, + 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, + 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, + 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, + 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, + 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, + 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, + 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, + 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, + 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, + 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, + 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, + 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, + 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, + 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, + 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, + 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, + 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, + 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, + 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, + 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, + 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, + 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, + 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +}; + #define NGRID_IQ2XXS 512 static const uint64_t iq1s_grid[NGRID_IQ2XXS] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, @@ -3736,6 +3803,49 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y } } +// ====================== 3.3125 bpw (de)-quantization + +void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint8_t * signs = x[i].signs; + + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; + const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qs += 8; + signs += 4; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); + } + y += 8; + } + qh += 2; + qs += 8; + signs += 4; + } + } +} + // ====================== 1.5625 bpw (de)-quantization void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) { @@ -8806,6 +8916,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r #endif +#if defined (__AVX2__) || defined (__ARM_NEON) static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, -1, @@ -8840,6 +8951,7 @@ static const int8_t keven_signs_q2xs[1024] = { 1, 1, 1, -1, -1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, }; +#endif void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(n % QK_K == 0); @@ -9327,6 +9439,202 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void #endif } +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq3_s * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; + + const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + + uint8x16x2_t vs; + ggml_int8x16x4_t q3s; + ggml_int8x16x4_t q8b; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + int sumi1 = 0, sumi2 = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)], + iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]}; + const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)], + iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]}; + const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)], + iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]}; + const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)], + iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]}; + qs += 16; + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); + vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0])); + q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); + + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); + vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[0] = vceqq_u8(vs.val[0], mask2); + vs.val[1] = vceqq_u8(vs.val[1], mask2); + + signs += 4; + + q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0])); + q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1])); + + const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); + const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); + sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf)); + sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); + } + sumf += d*(sumi1 + sumi2); + } + *s = 0.25f * sumf; + +#elif defined(__AVX2__) + + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 + }; + + static const uint8_t k_mask2[32] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + }; + + const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); + const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint16_t * restrict signs = (const uint16_t *)x[i].signs; + const int8_t * restrict q8 = y[i].qs; + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)], + iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)], + iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)], + iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)], + iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)], + iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)], + iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)], + iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]); + qs += 8; + const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)], + iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)], + iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)], + iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)], + iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)], + iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)], + iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)], + iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]); + qs += 8; + + __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_1 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_1 = _mm256_sub_epi8(_mm256_xor_si256(s2_1, q8_1), s2_1); + + aux256 = _mm256_set1_epi32(signs[2] | (signs[3] << 16)); + aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); + const __m256i s2_2 = _mm256_cmpeq_epi8(aux256, mask2); + const __m256i q8s_2 = _mm256_sub_epi8(_mm256_xor_si256(s2_2, q8_2), s2_2); + + signs += 4; + + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + const uint16_t ls1 = x[i].scales[ib32/2] & 0xf; + const uint16_t ls2 = x[i].scales[ib32/2] >> 4; + const __m256i p1 = _mm256_madd_epi16(dot1, _mm256_set1_epi16(2*ls1+1)); + const __m256i p2 = _mm256_madd_epi16(dot2, _mm256_set1_epi16(2*ls2+1)); + sumi1 = _mm256_add_epi32(sumi1, p1); + sumi2 = _mm256_add_epi32(sumi2, p2); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.25f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint8_t * restrict qs = x[i].qs; + const uint8_t * restrict qh = x[i].qh; + const uint8_t * restrict signs = x[i].signs; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const uint32_t ls1 = 2*(x[i].scales[ib32/2] & 0xf) + 1; + const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls1; + sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + for (int j = 0; j < 4; ++j) { + sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); + sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); + } + q8 += 8; + } + qs += 8; + signs += 4; + bsum += sumi * ls2; + } + sumf += d * bsum; + } + *s = 0.25f * sumf; +#endif +} + + #ifdef __AVX2__ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) { const __m256i ax = _mm256_sign_epi8(x, x); @@ -9523,6 +9831,7 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * float sumf = 0; for (int ib = 0; ib < nb; ib += 2) { + q4bits.val[0] = vld1q_u8(x[ib+0].qs); q4bits.val[1] = vld1q_u8(x[ib+1].qs); q8b.val[0] = vld1q_s8(y[ib+0].qs); @@ -10239,14 +10548,15 @@ typedef struct { uint16_t * neighbours; } iq3_entry_t; -static iq3_entry_t iq3_data[1] = { +static iq3_entry_t iq3_data[2] = { + {NULL, NULL, NULL}, {NULL, NULL, NULL}, }; static inline int iq3_data_index(int grid_size) { (void)grid_size; - GGML_ASSERT(grid_size == 256); - return 0; + GGML_ASSERT(grid_size == 256 || grid_size == 512); + return grid_size == 256 ? 0 : 1; } static int iq3_compare_func(const void * left, const void * right) { @@ -10278,9 +10588,44 @@ void iq3xs_init_impl(int grid_size) { 3185, 3215, 3252, 3288, 3294, 3364, 3397, 3434, 3483, 3523, 3537, 3587, 3589, 3591, 3592, 3610, 3626, 3670, 3680, 3722, 3749, 3754, 3776, 3789, 3803, 3824, 3857, 3873, 3904, 3906, 3924, 3992, }; + static const uint16_t kgrid_512[512] = { + 0, 1, 2, 5, 7, 8, 9, 10, 12, 14, 16, 17, 21, 27, 32, 34, + 37, 39, 41, 43, 48, 50, 57, 60, 63, 64, 65, 66, 68, 72, 73, 77, + 80, 83, 87, 89, 93, 100, 113, 117, 122, 128, 129, 133, 135, 136, 139, 142, + 145, 149, 152, 156, 162, 165, 167, 169, 171, 184, 187, 195, 201, 205, 208, 210, + 217, 219, 222, 228, 232, 234, 247, 249, 253, 256, 267, 271, 273, 276, 282, 288, + 291, 297, 312, 322, 324, 336, 338, 342, 347, 353, 357, 359, 374, 379, 390, 393, + 395, 409, 426, 441, 448, 450, 452, 464, 466, 470, 475, 488, 492, 512, 513, 514, + 516, 520, 521, 523, 525, 527, 528, 530, 537, 540, 542, 556, 558, 561, 570, 576, + 577, 579, 582, 584, 588, 593, 600, 603, 609, 616, 618, 632, 638, 640, 650, 653, + 655, 656, 660, 666, 672, 675, 685, 688, 698, 705, 708, 711, 712, 715, 721, 727, + 728, 732, 737, 754, 760, 771, 773, 778, 780, 793, 795, 802, 806, 808, 812, 833, + 840, 843, 849, 856, 858, 873, 912, 916, 919, 932, 934, 961, 963, 968, 970, 977, + 989, 993, 1010, 1016, 1024, 1025, 1027, 1029, 1031, 1032, 1034, 1036, 1038, 1041, 1043, 1047, + 1048, 1050, 1057, 1059, 1061, 1064, 1066, 1079, 1080, 1083, 1085, 1088, 1090, 1096, 1099, 1103, + 1106, 1109, 1113, 1116, 1122, 1129, 1153, 1156, 1159, 1169, 1171, 1176, 1183, 1185, 1195, 1199, + 1209, 1212, 1216, 1218, 1221, 1225, 1234, 1236, 1241, 1243, 1250, 1256, 1270, 1281, 1287, 1296, + 1299, 1306, 1309, 1313, 1338, 1341, 1348, 1353, 1362, 1375, 1376, 1387, 1400, 1408, 1410, 1415, + 1425, 1453, 1457, 1477, 1481, 1494, 1496, 1507, 1512, 1538, 1545, 1547, 1549, 1551, 1554, 1561, + 1563, 1565, 1570, 1572, 1575, 1577, 1587, 1593, 1601, 1603, 1605, 1612, 1617, 1619, 1632, 1648, + 1658, 1662, 1664, 1674, 1680, 1690, 1692, 1704, 1729, 1736, 1740, 1745, 1747, 1751, 1752, 1761, + 1763, 1767, 1773, 1787, 1795, 1801, 1806, 1810, 1817, 1834, 1840, 1844, 1857, 1864, 1866, 1877, + 1882, 1892, 1902, 1915, 1934, 1953, 1985, 1987, 2000, 2002, 2013, 2048, 2052, 2058, 2064, 2068, + 2071, 2074, 2081, 2088, 2104, 2114, 2119, 2121, 2123, 2130, 2136, 2141, 2147, 2153, 2157, 2177, + 2179, 2184, 2189, 2193, 2203, 2208, 2223, 2226, 2232, 2244, 2249, 2251, 2256, 2258, 2265, 2269, + 2304, 2306, 2324, 2335, 2336, 2361, 2373, 2375, 2385, 2418, 2443, 2460, 2480, 2504, 2509, 2520, + 2531, 2537, 2562, 2568, 2572, 2578, 2592, 2596, 2599, 2602, 2614, 2620, 2625, 2627, 2629, 2634, + 2641, 2650, 2682, 2688, 2697, 2707, 2712, 2718, 2731, 2754, 2759, 2760, 2775, 2788, 2793, 2805, + 2811, 2817, 2820, 2832, 2842, 2854, 2890, 2902, 2921, 2923, 2978, 3010, 3012, 3026, 3081, 3083, + 3085, 3097, 3099, 3120, 3136, 3152, 3159, 3188, 3210, 3228, 3234, 3245, 3250, 3256, 3264, 3276, + 3281, 3296, 3349, 3363, 3378, 3392, 3395, 3420, 3440, 3461, 3488, 3529, 3531, 3584, 3588, 3591, + 3600, 3602, 3614, 3616, 3628, 3634, 3650, 3657, 3668, 3683, 3685, 3713, 3716, 3720, 3726, 3729, + 3736, 3753, 3778, 3802, 3805, 3819, 3841, 3845, 3851, 3856, 3880, 3922, 3938, 3970, 3993, 4032, + }; + const int kmap_size = 4096; - const int nwant = 2; - const uint16_t * kgrid = kgrid_256; + const int nwant = grid_size == 256 ? 2 : 3; + const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512; uint32_t * kgrid_q3xs; int * kmap_q3xs; uint16_t * kneighbors_q3xs; @@ -10377,7 +10722,7 @@ void iq3xs_init_impl(int grid_size) { } void iq3xs_free_impl(int grid_size) { - GGML_ASSERT(grid_size == 256); + GGML_ASSERT(grid_size == 256 || grid_size == 512); const int gindex = iq3_data_index(grid_size); if (iq3_data[gindex].grid) { free(iq3_data[gindex].grid); iq3_data[gindex].grid = NULL; @@ -10410,9 +10755,10 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n, + const float * restrict quant_weights) { - const int gindex = iq3_data_index(256); + const int gindex = iq3_data_index(grid_size); const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; const int * kmap_q3xs = iq3_data[gindex].map; @@ -10426,9 +10772,23 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict const int kMaxQ = 8; - const int nbl = n/256; + const int nbl = n/QK_K; - block_iq3_xxs * y = vy; + ggml_fp16_t * dh; + uint8_t * qs; + int block_size; + if (grid_size == 256) { + block_iq3_xxs * y = vy; + dh = &y->d; + qs = y->qs; + block_size = sizeof(block_iq3_xxs); + } else { + block_iq3_s * y = vy; + dh = &y->d; + qs = y->qs; + block_size = sizeof(block_iq3_s); + } + int quant_size = block_size - sizeof(ggml_fp16_t); float scales[QK_K/32]; float weight[32]; @@ -10439,20 +10799,21 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict bool is_on_grid[8]; bool is_on_grid_aux[8]; uint8_t block_signs[8]; - uint8_t q3[3*(QK_K/8)]; + uint8_t q3[3*(QK_K/8)+QK_K/32]; uint32_t * scales_and_signs = (uint32_t *)(q3 + QK_K/4); + uint8_t * qh = q3 + 3*(QK_K/8); for (int ibl = 0; ibl < nbl; ++ibl) { - y[ibl].d = GGML_FP32_TO_FP16(0.f); - memset(q3, 0, 3*QK_K/8); + dh[0] = GGML_FP32_TO_FP16(0.f); + memset(q3, 0, 3*QK_K/8+QK_K/32); float max_scale = 0; const float * xbl = x + QK_K*ibl; float sumx2 = 0; for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; - float sigma2 = sumx2/QK_K; + float sigma2 = 2*sumx2/QK_K; for (int ib = 0; ib < QK_K/32; ++ib) { const float * xb = xbl + 32*ib; @@ -10570,7 +10931,13 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict printf("\n"); GGML_ASSERT(false); } - q3[8*ib+k] = grid_index; + if (grid_size == 256) { + q3[8*ib+k] = grid_index; + } else { + q3[8*ib+k] = grid_index & 255; + qh[ib] |= ((grid_index >> 8) << k); + } + } scales_and_signs[ib] = block_signs[0] | (block_signs[1] << 7) | (block_signs[2] << 14) | (block_signs[3] << 21); GGML_ASSERT(scale >= 0); @@ -10579,63 +10946,25 @@ static void quantize_row_iq3_xxs_impl(const float * restrict x, void * restrict } if (!max_scale) { - memset(y[ibl].qs, 0, 3*QK_K/8); + memset(qs, 0, quant_size); + dh += block_size/sizeof(ggml_fp16_t); + qs += block_size; continue; } float d = max_scale/31; - y[ibl].d = GGML_FP32_TO_FP16(d); + dh[0] = GGML_FP32_TO_FP16(d * 1.0125f); // small improvement via this fudge factor float id = 1/d; - float sumqx = 0, sumq2 = 0; for (int ib = 0; ib < QK_K/32; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(15, l)); scales_and_signs[ib] |= ((uint32_t)l << 28); - if (false) { - const float * xb = xbl + 32*ib; - if (quant_weights) { - const float * qw = quant_weights + QK_K*ibl + 32*ib; - for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); - } else { - for (int i = 0; i < 32; ++i) weight[i] = xb[i]*xb[i]; - } - const float db = 0.25f * d * (1 + 2*l); - for (int k = 0; k < 8; ++k) { - const int8_t * signs = keven_signs_q2xs + 8*((scales_and_signs[ib] >> 7*(k/2)) & 127) + 4*(k%2); - const float * xk = xb + 4*k; - const float * wk = weight + 4*k; - //const uint8_t * grid = (const uint8_t *)(kgrid_q3xs + q3[8*ib+k]); - const uint8_t * grid = (const uint8_t *)(iq3xxs_grid + q3[8*ib+k]); - float best_mse = 0; int best_index = q3[8*ib+k]; - for (int j = 0; j < 4; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - best_mse += wk[j] * diff * diff; - } - for (int idx = 0; idx < 256; ++idx) { - //grid = (const uint8_t *)(kgrid_q3xs + idx); - grid = (const uint8_t *)(iq3xxs_grid + idx); - float mse = 0; - for (int j = 0; j < 4; ++j) { - float diff = db * grid[j] * signs[j] - xk[j]; - mse += wk[j] * diff * diff; - } - if (mse < best_mse) { - best_mse = mse; best_index = idx; - } - } - q3[8*ib+k] = best_index; - //grid = (const uint8_t *)(kgrid_q3xs + best_index); - grid = (const uint8_t *)(iq3xxs_grid + best_index); - for (int j = 0; j < 4; ++j) { - float q = db * grid[j] * signs[j]; - sumqx += wk[j] * q * xk[j]; - sumq2 += wk[j] * q * q; - } - } - if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2); - } } - memcpy(y[ibl].qs, q3, 3*QK_K/8); + memcpy(qs, q3, quant_size); + + dh += block_size/sizeof(ggml_fp16_t); + qs += block_size; + } } @@ -10645,7 +10974,7 @@ size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int nblock = n_per_row/QK_K; char * qrow = (char *)dst; for (int row = 0; row < nrow; ++row) { - quantize_row_iq3_xxs_impl(src, qrow, n_per_row, quant_weights); + quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq3_xxs); } @@ -10660,9 +10989,226 @@ void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) { void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) { assert(k % QK_K == 0); - quantize_row_iq3_xxs_impl(x, y, k, NULL); + quantize_row_iq3_xxs_impl(256, x, y, k, NULL); +} + +static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, void * restrict vy, int n, + const float * restrict quant_weights, + float * scales, + float * weight, + float * xval, + int8_t * L, + int8_t * Laux, + float * waux, + bool * is_on_grid, + bool * is_on_grid_aux, + uint8_t * block_signs) { + + const int gindex = iq3_data_index(512); + + const uint32_t * kgrid_q3xs = iq3_data[gindex].grid; + const int * kmap_q3xs = iq3_data[gindex].map; + const uint16_t * kneighbors_q3xs = iq3_data[gindex].neighbours; + + //GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q3xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + const int kMaxQ = 8; + + const int nbl = n/QK_K; + + block_iq3_s * y = vy; + + const int bs4 = block_size/4; + const int bs8 = block_size/8; + + for (int ibl = 0; ibl < nbl; ++ibl) { + + memset(&y[ibl], 0, sizeof(block_iq3_s)); + y[ibl].d = GGML_FP32_TO_FP16(0.f); + + uint8_t * qs = y[ibl].qs; + uint8_t * qh = y[ibl].qh; + uint8_t * signs = y[ibl].signs; + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = 2*sumx2/QK_K; + + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + for (int i = 0; i < block_size; ++i) waux[i] = sqrtf(weight[i]); + for (int k = 0; k < bs8; ++k) { + uint8_t s = 0; + for (int i = 0; i < 8; ++i) { + if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i]; + else { + xval[8*k + i] = -xb[8*k + i]; s |= (1 << i); + } + } + block_signs[k] = s; + } + float max = xval[0]; + for (int i = 1; i < block_size; ++i) max = MAX(max, xval[i]); + if (!max) { + scales[ib] = 0; + continue; + } + float best = 0; + float scale = max/(2*kMaxQ-1); + for (int is = -15; is <= 15; ++is) { + float id = (2*kMaxQ-1+is*0.2f)/max; + float this_scale = 1/id; + for (int k = 0; k < bs4; ++k) { + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + Laux[4*k+i] = MAX(0, MIN(kMaxQ-1, l)); + } + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (Laux[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + is_on_grid_aux[k] = true; + if (grid_index < 0) { + is_on_grid_aux[k] = false; + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, this_scale, Laux + 4*k); + } + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < block_size; ++i) { + float w = weight[i]; + float q = 2*Laux[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0 && sumqx*sumqx > best*sumq2) { + scale = sumqx/sumq2; best = scale*sumqx; + for (int i = 0; i < block_size; ++i) L[i] = Laux[i]; + for (int k = 0; k < bs4; ++k) is_on_grid[k] = is_on_grid_aux[k]; + } + } + int n_not_ongrid = 0; + for (int k = 0; k < bs4; ++k) if (!is_on_grid[k]) ++n_not_ongrid; + if (n_not_ongrid > 0 && scale > 0) { + float id = 1/scale; + for (int k = 0; k < bs4; ++k) { + if (is_on_grid[k]) continue; + uint16_t u = 0; + for (int i = 0; i < 4; ++i) { + int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); + l = MAX(0, MIN(kMaxQ-1, l)); + u |= (l << 3*i); + } + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + const uint16_t * neighbours = kneighbors_q3xs - kmap_q3xs[u] - 1; + grid_index = iq3_find_best_neighbour(neighbours, kgrid_q3xs, xval + 4*k, waux + 4*k, scale, L + 4*k); + } + const int8_t * pg = (const int8_t *)(kgrid_q3xs + grid_index); + for (int i = 0; i < 4; ++i) L[4*k+i] = (pg[i] - 1)/2; + } + float sumqx = 0, sumq2 = 0; + for (int i = 0; i < block_size; ++i) { + float w = weight[i]; + float q = 2*L[i] + 1; + sumqx += w*xval[i]*q; + sumq2 += w*q*q; + } + if (sumq2 > 0) scale = sumqx/sumq2; + } + if (scale < 0) { + // This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale) + // and correspondingly flip quant signs. + scale = -scale; + for (int k = 0; k < bs8; ++k) block_signs[k] = ~block_signs[k]; + } + for (int k = 0; k < bs4; ++k) { + uint16_t u = 0; + for (int i = 0; i < 4; ++i) u |= (L[4*k+i] << 3*i); + int grid_index = kmap_q3xs[u]; + if (grid_index < 0) { + printf("Oops: found point %u not on grid:", u); + for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); + printf("\n"); + GGML_ASSERT(false); + } + qs[k] = grid_index & 255; + qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); + } + qs += bs4; + for (int k = 0; k < bs8; ++k) signs[k] = block_signs[k]; + signs += bs8; + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + continue; + } + + float d = max_scale/31; + y[ibl].d = GGML_FP32_TO_FP16(d); + float id = 1/d; + for (int ib = 0; ib < QK_K/block_size; ib += 2) { + int l1 = nearest_int(0.5f*(id*scales[ib+0]-1)); + l1 = MAX(0, MIN(15, l1)); + int l2 = nearest_int(0.5f*(id*scales[ib+1]-1)); + l2 = MAX(0, MIN(15, l2)); + y[ibl].scales[ib/2] = l1 | (l2 << 4); + } + + } +} + +#define IQ3S_BLOCK_SIZE 32 +size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { + (void)hist; + GGML_ASSERT(n_per_row%QK_K == 0); + int nblock = n_per_row/QK_K; + float scales[QK_K/IQ3S_BLOCK_SIZE]; + float weight[IQ3S_BLOCK_SIZE]; + float xval[IQ3S_BLOCK_SIZE]; + int8_t L[IQ3S_BLOCK_SIZE]; + int8_t Laux[IQ3S_BLOCK_SIZE]; + float waux[IQ3S_BLOCK_SIZE]; + bool is_on_grid[IQ3S_BLOCK_SIZE/4]; + bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4]; + uint8_t block_signs[IQ3S_BLOCK_SIZE/8]; + char * qrow = (char *)dst; + for (int row = 0; row < nrow; ++row) { + quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights, + scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs); + src += n_per_row; + qrow += nblock*sizeof(block_iq3_s); + } + return nrow * nblock * sizeof(block_iq3_s); +} + +void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq3_s * restrict y = vy; + quantize_row_iq3_s_reference(x, y, k); } +void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) { + assert(k % QK_K == 0); + quantize_iq3_s(x, y, 1, k, NULL, NULL); +} + + // =================================== 1.5 bpw =================================================== static int iq1_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid, diff --git a/ggml-quants.h b/ggml-quants.h index 113623b62938a..303b0b6f9552e 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -191,6 +191,21 @@ typedef struct { } block_iq3_xxs; static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding"); +// 3.4375 bpw +#if QK_K == 64 +#define IQ3S_N_SCALE 2 +#else +#define IQ3S_N_SCALE QK_K/64 +#endif +typedef struct { + ggml_fp16_t d; + uint8_t qs[QK_K/4]; + uint8_t qh[QK_K/32]; + uint8_t signs[QK_K/8]; + uint8_t scales[IQ3S_N_SCALE]; +} block_iq3_s; +static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding"); + typedef struct { ggml_fp16_t d; uint8_t qs[QK_K/8]; @@ -226,6 +241,7 @@ void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGM void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); +void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); @@ -242,6 +258,7 @@ void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); @@ -262,6 +279,7 @@ void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_ void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -280,6 +298,7 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") @@ -289,6 +308,7 @@ size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); +size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix); diff --git a/ggml.c b/ggml.c index d710fe702ddbd..c09a3cad657f2 100644 --- a/ggml.c +++ b/ggml.c @@ -678,6 +678,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ3_S] = { + .type_name = "iq3_s", + .blck_size = QK_K, + .type_size = sizeof(block_iq3_s), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq3_s, + .from_float = quantize_row_iq3_s, + .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference, + .vec_dot = ggml_vec_dot_iq3_s_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_IQ1_S] = { .type_name = "iq1_s", .blck_size = QK_K, @@ -2304,6 +2316,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; + case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -7738,6 +7751,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_add_q_f32(params, dst); } break; @@ -8017,6 +8031,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_add1_q_f32(params, dst); } break; @@ -8141,6 +8156,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: default: { GGML_ASSERT(false); @@ -11039,6 +11055,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_out_prod_q_f32(params, dst); } break; @@ -11227,6 +11244,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: default: { GGML_ASSERT(false); @@ -11429,6 +11447,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: { ggml_compute_forward_get_rows_q(params, dst); } break; @@ -12129,6 +12148,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -12212,6 +12232,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -19463,6 +19484,7 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; + case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; default: // nothing break; } @@ -19737,6 +19759,15 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); GGML_ASSERT(result == row_size * nrows); } break; + case GGML_TYPE_IQ3_S: + { + GGML_ASSERT(start % QK_K == 0); + GGML_ASSERT(start % n_per_row == 0); + size_t start_row = start / n_per_row; + size_t row_size = ggml_row_size(type, n_per_row); + result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix); + GGML_ASSERT(result == row_size * nrows); + } break; case GGML_TYPE_IQ1_S: { GGML_ASSERT(start % QK_K == 0); diff --git a/ggml.h b/ggml.h index 37eff627928e8..a4166e1f7afd0 100644 --- a/ggml.h +++ b/ggml.h @@ -350,6 +350,7 @@ extern "C" { GGML_TYPE_IQ3_XXS = 18, GGML_TYPE_IQ1_S = 19, GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -389,6 +390,7 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors }; // available tensor operations: diff --git a/llama.cpp b/llama.cpp index 37477e6ef3c44..1f6b6cff48987 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2545,6 +2545,7 @@ struct llama_model_loader { case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; + case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2890,6 +2891,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw"; default: return "unknown, may not work"; } @@ -10544,6 +10547,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) { + new_type = GGML_TYPE_Q4_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } @@ -10575,13 +10584,17 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty new_type = GGML_TYPE_Q8_0; } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { - new_type = GGML_TYPE_Q2_K; + new_type = GGML_TYPE_IQ3_XXS; + } + } else if (name.find("attn_q.weight") != std::string::npos) { + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + new_type = GGML_TYPE_IQ3_XXS; } } else if (name.find("ffn_down") != std::string::npos) { auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str()); int i_layer = info.first, n_layer = info.second; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) { @@ -10592,6 +10605,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 || + (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) { new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K; } @@ -10623,37 +10640,41 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (qs.model.hparams.n_expert == 8) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || - ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { + ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S || + ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { new_type = GGML_TYPE_Q5_K; } } else { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_Q3_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K; } } else { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; } } else if (name.find("attn_qkv.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { + new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } else if (name.find("ffn_gate") != std::string::npos) { auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { - new_type = GGML_TYPE_Q2_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_gate; } else if (name.find("ffn_up") != std::string::npos) { auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str()); int i_layer = info.first, n_layer = info.second; - if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && !use_more_bits(i_layer, n_layer)) { - new_type = GGML_TYPE_Q2_K; + if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) { + new_type = GGML_TYPE_IQ3_XXS; } ++qs.i_ffn_up; } @@ -10673,7 +10694,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || - new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { + new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -10688,6 +10709,7 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break; @@ -10719,7 +10741,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K_S: case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; - case LLAMA_FTYPE_MOSTLY_Q3_K_XS: + case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_IQ3_S; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; @@ -10733,6 +10755,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break; + case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break; + case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } diff --git a/llama.h b/llama.h index 84f196b3bb625..889edf4d97b96 100644 --- a/llama.h +++ b/llama.h @@ -102,6 +102,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 55db42bf6e851..f8574588bcf2f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1918,7 +1918,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, - GGML_TYPE_IQ4_NL, + GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, }; // unary ops diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index 5e92d5742a3cc..04656bb9e8e83 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -151,6 +151,7 @@ int main(int argc, char * argv[]) { const float max_quantization_error = type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : + type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR; failed = !(total_error < max_quantization_error); num_failed += failed; @@ -167,7 +168,8 @@ int main(int argc, char * argv[]) { const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS ? MAX_DOT_PRODUCT_ERROR_LOWBIT : MAX_DOT_PRODUCT_ERROR; + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S ? MAX_DOT_PRODUCT_ERROR_LOWBIT + : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; if (failed || verbose) { From 9e359a4f47c1b2dceb99e29706c9f7403d32ab5e Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 24 Feb 2024 19:16:04 +0100 Subject: [PATCH 03/18] server: continue to update other slots on embedding concurrent request (#5699) * server: #5655 - continue to update other slots on embedding concurrent request. * server: tests: add multi users embeddings as fixed * server: tests: adding OAI compatible embedding concurrent endpoint * server: tests: adding OAI compatible embedding with multiple inputs --- examples/server/server.cpp | 2 +- examples/server/tests/features/issues.feature | 34 +--- .../server/tests/features/parallel.feature | 46 ++++++ examples/server/tests/features/server.feature | 13 ++ examples/server/tests/features/steps/steps.py | 151 +++++++++++++----- 5 files changed, 168 insertions(+), 78 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9fb436c2a18ec..19a8c1067e72a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1836,7 +1836,7 @@ struct llama_server_context send_embedding(slot); slot.release(); slot.i_batch = -1; - return true; + continue; } completion_token_output result; diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature index 542006d9a8df2..bf5a175a357ca 100644 --- a/examples/server/tests/features/issues.feature +++ b/examples/server/tests/features/issues.feature @@ -1,36 +1,4 @@ # List of ongoing issues @bug Feature: Issues - # Issue #5655 - Scenario: Multi users embeddings - Given a server listening on localhost:8080 - And a model file stories260K.gguf - And a model alias tinyllama-2 - And 42 as server seed - And 64 KV cache size - And 2 slots - And continuous batching - And embeddings extraction - Then the server is starting - Then the server is healthy - - Given a prompt: - """ - Write a very long story about AI. - """ - And a prompt: - """ - Write another very long music lyrics. - """ - And a prompt: - """ - Write a very long poem. - """ - And a prompt: - """ - Write a very long joke. - """ - Given concurrent embedding requests - Then the server is busy - Then the server is idle - Then all embeddings are generated + # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 802d624ffc9a3..c85f9de1d9a52 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -8,6 +8,7 @@ Feature: Parallel And 42 as server seed And 64 KV cache size And 2 slots + And embeddings extraction And continuous batching Then the server is starting Then the server is healthy @@ -75,3 +76,48 @@ Feature: Parallel Then the server is busy Then the server is idle Then all prompts are predicted + + Scenario: Multi users embeddings + Given a prompt: + """ + Write a very long story about AI. + """ + And a prompt: + """ + Write another very long music lyrics. + """ + And a prompt: + """ + Write a very long poem. + """ + And a prompt: + """ + Write a very long joke. + """ + Given concurrent embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated + + Scenario: Multi users OAI compatibility embeddings + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + And a prompt: + """ + What is the biggest US city ? + """ + And a prompt: + """ + What is the capital of Bulgaria ? + """ + And a model tinyllama-2 + Given concurrent OAI embedding requests + Then the server is busy + Then the server is idle + Then all embeddings are generated diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index fedcfe5aef1b3..5f81d256a548c 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -60,6 +60,19 @@ Feature: llama.cpp server """ Then embeddings are generated + Scenario: OAI Embeddings compatibility with multiple inputs + Given a model tinyllama-2 + Given a prompt: + """ + In which country Paris is located ? + """ + And a prompt: + """ + Is Madrid the capital of Spain ? + """ + When an OAI compatible embeddings computation request for multiple inputs + Then embeddings are generated + Scenario: Tokenize / Detokenize When tokenizing: diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 50f2b641e764e..9c825fdbcd7f5 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1,4 +1,5 @@ import asyncio +import collections import json import os import re @@ -261,35 +262,35 @@ def step_a_prompt_prompt(context, prompt): @step(u'concurrent completion requests') @async_run_until_complete() async def step_concurrent_completion_requests(context): - await concurrent_completion_requests(context, - request_completion, - # prompt is inserted automatically - context.base_url, - debug=context.debug, - n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - server_seed=context.server_seed if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key if hasattr(context, - 'user_api_key') else None) + await concurrent_requests(context, + request_completion, + # prompt is inserted automatically + context.base_url, + debug=context.debug, + n_predict=context.n_predict if hasattr(context, 'n_predict') else None, + server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key if hasattr(context, + 'user_api_key') else None) @step(u'concurrent OAI completions requests') @async_run_until_complete async def step_oai_chat_completions(context): - await concurrent_completion_requests(context, oai_chat_completions, - # user_prompt is inserted automatically - context.system_prompt, - context.base_url, - True, # async_client - model=context.model - if hasattr(context, 'model') else None, - n_predict=context.n_predict - if hasattr(context, 'n_predict') else None, - enable_streaming=context.enable_streaming - if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, - user_api_key=context.user_api_key - if hasattr(context, 'user_api_key') else None) + await concurrent_requests(context, oai_chat_completions, + # user_prompt is inserted automatically + context.system_prompt, + context.base_url, + True, # async_client + model=context.model + if hasattr(context, 'model') else None, + n_predict=context.n_predict + if hasattr(context, 'n_predict') else None, + enable_streaming=context.enable_streaming + if hasattr(context, 'enable_streaming') else None, + server_seed=context.server_seed + if hasattr(context, 'server_seed') else None, + user_api_key=context.user_api_key + if hasattr(context, 'user_api_key') else None) @step(u'all prompts are predicted') @@ -316,36 +317,58 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None): @step(u'embeddings are computed for') @async_run_until_complete async def step_compute_embedding(context): - content = context.text - base_url = context.base_url - context.embeddings = await request_embedding(content, base_url) + context.embeddings = await request_embedding(context.text, base_url=context.base_url) @step(u'embeddings are generated') def step_assert_embeddings(context): - assert_embeddings(context.embeddings) + if len(context.prompts) == 0: + assert_embeddings(context.embeddings) + else: + assert len(context.embeddings) == len(context.prompts), (f"unexpected response:\n" + f"context.prompts={context.prompts}\n" + f"context.embeddings={context.embeddings}") + for embedding in context.embeddings: + context.prompts.pop() + assert_embeddings(embedding) @step(u'an OAI compatible embeddings computation request for') -def step_oai_compute_embedding(context): - openai.api_key = 'nope' # openai client always expects an api_keu - if context.user_api_key is not None: - openai.api_key = context.user_api_key - openai.api_base = f'{context.base_url}/v1' - embeddings = openai.Embedding.create( - model=context.model, - input=context.text, - ) - context.embeddings = embeddings +@async_run_until_complete +async def step_oai_compute_embeddings(context): + context.embeddings = await request_oai_embeddings(context.text, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) + + +@step(u'an OAI compatible embeddings computation request for multiple inputs') +@async_run_until_complete +async def step_oai_compute_embeddings_multiple_inputs(context): + context.embeddings = await request_oai_embeddings(context.prompts, + base_url=context.base_url, + user_api_key=context.user_api_key, + model=context.model) @step(u'concurrent embedding requests') @async_run_until_complete() async def step_concurrent_embedding_requests(context): - await concurrent_completion_requests(context, - request_embedding, - # prompt is inserted automatically - context.base_url) + await concurrent_requests(context, + request_embedding, + # prompt is inserted automatically + base_url=context.base_url) + + +@step(u'concurrent OAI embedding requests') +@async_run_until_complete() +async def step_concurrent_oai_embedding_requests(context): + await concurrent_requests(context, + request_oai_embeddings, + # prompt is inserted automatically + base_url=context.base_url, + async_client=True, + model=context.model) @step(u'all embeddings are generated') @@ -401,7 +424,7 @@ def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value -async def concurrent_completion_requests(context, f_completion, *args, **kwargs): +async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: print(f"starting {n_prompts} concurrent completion requests...") @@ -565,7 +588,7 @@ async def oai_chat_completions(user_prompt, return completion_response -async def request_embedding(content, base_url): +async def request_embedding(content, base_url=None): async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/embedding', json={ @@ -576,6 +599,46 @@ async def request_embedding(content, base_url): return response_json['embedding'] +async def request_oai_embeddings(input, + base_url=None, user_api_key=None, + model=None, async_client=False): + # openai client always expects an api_key + user_api_key = user_api_key if user_api_key is not None else 'nope' + if async_client: + origin = 'llama.cpp' + if user_api_key is not None: + headers = {'Authorization': f'Bearer {user_api_key}', 'Origin': origin} + async with aiohttp.ClientSession() as session: + async with session.post(f'{base_url}/v1/embeddings', + json={ + "input": input, + "model": model, + }, + headers=headers) as response: + assert response.status == 200, f"received status code not expected: {response.status}" + assert response.headers['Access-Control-Allow-Origin'] == origin + assert response.headers['Content-Type'] == "application/json; charset=utf-8" + response_json = await response.json() + assert response_json['model'] == model, f"invalid model received: {response_json['model']}" + assert response_json['object'] == 'list' + return response_json['data'] + else: + openai.api_key = user_api_key + openai.api_base = f'{base_url}/v1' + oai_embeddings = openai.Embedding.create( + model=model, + input=input, + ) + + if isinstance(input, collections.abc.Sequence): + embeddings = [] + for an_oai_embeddings in oai_embeddings.data: + embeddings.append(an_oai_embeddings.embedding) + else: + embeddings = oai_embeddings.data.embedding + return embeddings + + def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re_content=None): content = completion_response['content'] n_predicted = completion_response['timings']['predicted_n'] From 69917dfa55674c608360638bb4d6a12a315e2810 Mon Sep 17 00:00:00 2001 From: Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> Date: Sun, 25 Feb 2024 10:54:04 +0100 Subject: [PATCH 04/18] py : fix StableLM conversion after config.json changes (#5703) * Fix issues during StableLM models conversion * Fix hard coded layer_norm_eps * Support layer_norm_eps for LlavaStableLM Co-authored-by: Jared Van Bortel * Add missing parenthesis Co-authored-by: Jared Van Bortel * Support rotary_factor for LlavaStableLM Co-authored-by: Jared Van Bortel * fix typo * Add StableLMEpochForCausalLM for safety Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> * Add StableLMEpochForCausalLM for safety 2 Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> --------- Co-authored-by: Jared Van Bortel Co-authored-by: Jared Van Bortel Co-authored-by: compilade <113953597+compilade@users.noreply.github.com> --- convert-hf-to-gguf.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 32d54b45f3325..ae30b2a76971a 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -192,7 +192,7 @@ def from_model_architecture(model_architecture): return RefactModel if model_architecture == "PersimmonForCausalLM": return PersimmonModel - if model_architecture in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return StableLMModel if model_architecture == "QWenLMHeadModel": return QwenModel @@ -253,7 +253,7 @@ def _get_model_architecture(self) -> gguf.MODEL_ARCH: return gguf.MODEL_ARCH.REFACT if arch == "PersimmonForCausalLM": return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): + if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): return gguf.MODEL_ARCH.STABLELM if arch == "QWenLMHeadModel": return gguf.MODEL_ARCH.QWEN @@ -1074,10 +1074,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"]))) + rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) - self.gguf_writer.add_layer_norm_eps(1e-5) + self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) class MixtralModel(Model): From ab336a9d5e5352ecdcdf4c12d2d54cf4ef82ce31 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 25 Feb 2024 12:09:09 +0200 Subject: [PATCH 05/18] code : normalize enum names (#5697) * coda : normalize enum names ggml-ci * code : cont * code : cont --- common/common.cpp | 18 +- common/common.h | 4 +- common/train.cpp | 10 +- examples/baby-llama/baby-llama.cpp | 2 +- examples/finetune/finetune.cpp | 2 +- examples/llama-bench/llama-bench.cpp | 14 +- examples/llava/llava.cpp | 2 +- examples/server/server.cpp | 18 +- .../train-text-from-scratch.cpp | 2 +- ggml-cuda.cu | 138 +++---- ggml-metal.m | 4 +- ggml-opencl.cpp | 50 +-- ggml-sycl.cpp | 152 ++++---- ggml-vulkan.cpp | 102 ++--- ggml.c | 350 +++++++++--------- ggml.h | 38 +- llama.cpp | 64 ++-- llama.h | 28 +- tests/test-backend-ops.cpp | 4 +- tests/test-opt.cpp | 2 +- 20 files changed, 502 insertions(+), 502 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 10ef11829cc50..ec596f5a075de 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -295,9 +295,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-scale") { if (++i >= argc) { @@ -630,11 +630,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; @@ -837,15 +837,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { diff --git a/common/common.h b/common/common.h index 935771d44ca9c..3e21579b00545 100644 --- a/common/common.h +++ b/common/common.h @@ -61,7 +61,7 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs + llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. @@ -75,7 +75,7 @@ struct gpt_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - int32_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; + int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; // // sampling parameters diff --git a/common/train.cpp b/common/train.cpp index e4c3d5df61818..0dbfd24df2314 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -31,7 +31,7 @@ struct train_state * init_train_state() { state->opt = new struct ggml_opt_context; state->opt->ctx = NULL; - state->opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + state->opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); state->opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; state->opt->loss_after = 0.0f; @@ -556,7 +556,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g std::string opt_type; GGUF_GET_KEY(fctx, opt_type, gguf_get_val_str, GGUF_TYPE_STRING, true, LLM_KV_OPTIMIZER_TYPE); if (opt_type == LLM_KV_OPTIMIZER_TYPE_ADAM) { - opt->params.type = GGML_OPT_ADAM; + opt->params.type = GGML_OPT_TYPE_ADAM; GGUF_GET_KEY(fctx, opt->adam.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS); GGUF_GET_KEY(fctx, opt->adam.fx_prev, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS); @@ -568,7 +568,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g copy_tensor_by_name(opt->adam.v, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS); copy_tensor_by_name(opt->adam.pf, f_ggml_ctx, LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES); } else if (opt_type == LLM_KV_OPTIMIZER_TYPE_LBFGS) { - opt->params.type = GGML_OPT_LBFGS; + opt->params.type = GGML_OPT_TYPE_LBFGS; GGUF_GET_KEY(fctx, opt->params.lbfgs.m, gguf_get_val_u32, GGUF_TYPE_UINT32, true, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT); GGUF_GET_KEY(fctx, opt->lbfgs.fx_best, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS); @@ -603,7 +603,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_set_val_bool(fctx, LLM_KV_OPTIMIZER_JUST_INITIALIZED, opt->just_initialized); switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM); gguf_set_val_f32(fctx, LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, opt->adam.fx_best); @@ -622,7 +622,7 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * gguf_add_tensor(fctx, opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { gguf_set_val_str(fctx, LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS); gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, opt->params.lbfgs.m); diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 65bb238a0d565..bf0125e753746 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -1547,7 +1547,7 @@ int main(int argc, char ** argv) { float error_before_opt = ggml_get_f32_1d(e, 0); - struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS); + struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_TYPE_LBFGS); opt_params_lbfgs.print_forward_graph = false; opt_params_lbfgs.print_backward_graph = false; opt_params_lbfgs.lbfgs.n_iter = 16; diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 98bf5a07a7ed1..3da5317b3d910 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -1531,7 +1531,7 @@ int main(int argc, char ** argv) { lora.hparams.n_rank_output = n_rank_output; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 11410f8ae7625..8fec3d43ddfdd 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -157,9 +157,9 @@ static const char * output_format_str(output_formats format) { static const char * split_mode_str(llama_split_mode mode) { switch (mode) { - case LLAMA_SPLIT_NONE: return "none"; - case LLAMA_SPLIT_LAYER: return "layer"; - case LLAMA_SPLIT_ROW: return "row"; + case LLAMA_SPLIT_MODE_NONE: return "none"; + case LLAMA_SPLIT_MODE_LAYER: return "layer"; + case LLAMA_SPLIT_MODE_ROW: return "row"; default: GGML_ASSERT(!"invalid split mode"); } } @@ -193,7 +193,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, - /* split_mode */ {LLAMA_SPLIT_LAYER}, + /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -358,11 +358,11 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { for (const auto & m : p) { llama_split_mode mode; if (m == "none") { - mode = LLAMA_SPLIT_NONE; + mode = LLAMA_SPLIT_MODE_NONE; } else if (m == "layer") { - mode = LLAMA_SPLIT_LAYER; + mode = LLAMA_SPLIT_MODE_LAYER; } else if (m == "row") { - mode = LLAMA_SPLIT_ROW; + mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; break; diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 1a1cf7c78bf34..9801281661b25 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -152,7 +152,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip); model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); - if (newline_tmp->backend != GGML_BACKEND_CPU) { + if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) { if (newline_tmp->buffer == NULL) { printf("newline_tmp tensor buffer is NULL\n"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 19a8c1067e72a..780862ef67810 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2086,9 +2086,9 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, break; } std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; } + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } else { invalid_param = true; break; } } else if (arg == "--rope-freq-base") @@ -2212,15 +2212,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::string arg_next = argv[i]; if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_NONE; + params.split_mode = LLAMA_SPLIT_MODE_NONE; } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_LAYER; + params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { - params.split_mode = LLAMA_SPLIT_ROW; + params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; @@ -2447,15 +2447,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, sep++; if (strncmp(sep, "int:", 4) == 0) { sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_INT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; kvo.int_value = std::atol(sep); } else if (strncmp(sep, "float:", 6) == 0) { sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_FLOAT; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; kvo.float_value = std::atof(sep); } else if (strncmp(sep, "bool:", 5) == 0) { sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_BOOL; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; if (std::strcmp(sep, "true") == 0) { kvo.bool_value = true; } else if (std::strcmp(sep, "false") == 0) { diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index e78ab185d89f3..7eafe8515a943 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -960,7 +960,7 @@ int main(int argc, char ** argv) { struct ggml_opt_context * opt = train->opt; // set opt params from command line - opt->params = ggml_opt_default_params(GGML_OPT_ADAM); + opt->params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); opt->params.print_forward_graph = false; opt->params.print_backward_graph = false; opt->params.graph_size = LLAMA_TRAIN_MAX_NODES; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 21c612cb71b48..fb6d4f7d215b6 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -6369,11 +6369,11 @@ static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int n int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -7927,10 +7927,10 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co const dim3 block_dims(ncols, 1, 1); const dim3 block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols); - } else if (order == GGML_SORT_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols); + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols); } else { GGML_ASSERT(false); } @@ -8362,11 +8362,11 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( cudaMemcpyKind kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = cudaMemcpyHostToDevice; src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -8771,7 +8771,7 @@ static void ggml_cuda_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -8920,7 +8920,7 @@ static void ggml_cuda_op_mul_mat_vec_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -9096,7 +9096,7 @@ static void ggml_cuda_op_mul_mat_cublas( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; const int compute_capability = g_device_caps[id].cc; @@ -9444,7 +9444,7 @@ static void ggml_cuda_op_soft_max( const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_GPU; + const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; if (src2_on_device) { ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; @@ -9502,16 +9502,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -9555,7 +9555,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaDeviceSynchronize()); } } @@ -9636,8 +9636,8 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -9653,20 +9653,20 @@ static void ggml_cuda_op_mul_mat( ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); std::array tensor_split; if (split) { - // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; tensor_split = buft_ctx->tensor_split; @@ -9724,8 +9724,8 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; @@ -9776,8 +9776,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9802,12 +9802,12 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9820,14 +9820,14 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -9845,10 +9845,10 @@ static void ggml_cuda_op_mul_mat( if (!dst_on_device) { void * dst_off_device; cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device]; kind = cudaMemcpyDeviceToDevice; } else { @@ -9913,7 +9913,7 @@ static void ggml_cuda_op_mul_mat( } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_cuda_set_device(g_main_device); CUDA_CHECK(cudaDeviceSynchronize()); } @@ -10019,7 +10019,7 @@ GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const stru static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -10050,7 +10050,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -10109,7 +10109,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS @@ -10255,11 +10255,11 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -10409,7 +10409,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); @@ -10553,7 +10553,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s cudaStream_t stream = g_cudaStreams[g_main_device][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); @@ -10570,20 +10570,20 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -10611,9 +10611,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_CPU ? + const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; for (int32_t row_id = 0; row_id < n_as; ++row_id) { @@ -10668,7 +10668,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CUDA_CHECK(cudaStreamSynchronize(stream)); } } @@ -10685,8 +10685,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -10817,9 +10817,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st if (!g_cublas_loaded) return false; ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -10966,14 +10966,14 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -11072,7 +11072,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -11087,7 +11087,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11098,7 +11098,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t } GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; @@ -11333,7 +11333,7 @@ GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_bu CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); } } - tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; tensor->extra = extra; } @@ -11605,7 +11605,7 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); } @@ -11614,7 +11614,7 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } @@ -11644,7 +11644,7 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg ggml_cuda_set_main_device(cuda_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11654,13 +11654,13 @@ GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, gg } #ifndef NDEBUG - assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-metal.m b/ggml-metal.m index ee584cfa71ce7..3d6b01263acb5 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2262,8 +2262,8 @@ static bool ggml_metal_graph_compute( id pipeline = nil; switch (order) { - case GGML_SORT_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; - case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; + case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; + case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; default: GGML_ASSERT(false); }; diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 797bee66799b5..df619a884842c 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1354,7 +1354,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) { } void ggml_cl_free_data(const struct ggml_tensor* tensor) { - if (tensor->backend != GGML_BACKEND_GPU) { + if (tensor->backend != GGML_BACKEND_TYPE_GPU) { return; } @@ -1412,7 +1412,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o } static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1476,7 +1476,7 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src } static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -1566,13 +1566,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_TYPE_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1580,7 +1580,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1589,7 +1589,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); } @@ -1612,7 +1612,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); } @@ -1621,13 +1621,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } - if (src1->backend != GGML_BACKEND_GPU) { + if (src1->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_Y, y_size); } - if (dst->backend != GGML_BACKEND_GPU) { + if (dst->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_D, d_size); } } @@ -1670,7 +1670,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr size_t y_size; size_t d_size; cl_mem d_X; - if (src0->backend == GGML_BACKEND_GPU) { // NOLINT + if (src0->backend == GGML_BACKEND_TYPE_GPU) { // NOLINT d_X = (cl_mem) src0->extra; } else { d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); @@ -1687,7 +1687,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr // TODO: copy src0 here when r3>1 for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - if (src0->backend == GGML_BACKEND_GPU) { + if (src0->backend == GGML_BACKEND_TYPE_GPU) { x_offset = (i03 * ne02 + i02) * x_ne; } else { // copy src0 to device @@ -1741,7 +1741,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_fp16_to_fp32_row(tmp, d, d_ne); @@ -1753,7 +1753,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } } - if (src0->backend != GGML_BACKEND_GPU) { + if (src0->backend != GGML_BACKEND_TYPE_GPU) { ggml_cl_pool_free(d_X, x_size); } ggml_cl_pool_free(d_Y, y_size); @@ -1798,7 +1798,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); cl_mem d_Q; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { d_Q = ggml_cl_pool_malloc(q_sz, &q_size); } @@ -1817,10 +1817,10 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { events.emplace_back(); CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { d_Q = (cl_mem) src0->extra; } else { GGML_ASSERT(false); @@ -1829,7 +1829,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * if (!mul_mat_vec) { // convert src0 to fp32 on device const size_t global = x_ne / global_denom; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); @@ -1843,7 +1843,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * // compute const size_t global = ne01 * local; - const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; + const size_t offset = src0->backend == GGML_BACKEND_TYPE_GPU ? (i03 * ne02 + i02) * x_bps : 0; const cl_int ncols = ne00; events.emplace_back(); CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); @@ -1895,7 +1895,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } ggml_cl_pool_free(d_Y, y_size); ggml_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { ggml_cl_pool_free(d_Q, q_size); } } @@ -1911,7 +1911,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU)) { return true; } @@ -1993,7 +1993,7 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { CL_CHECK(clFinish(queue)); tensor->extra = dst; - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); } // ggml-backend @@ -2045,7 +2045,7 @@ static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ctx->sub_buffers.push_back(sub_buffer); tensor->extra = sub_buffer; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; } static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index b897828f967b1..c6c3c6e6fef07 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -3338,7 +3338,7 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){ size_t total_elements = ggml_nelements(src); - const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT; + const bool src_on_device = src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT; float *src_data =NULL; if(src_on_device) { ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *) src->extra; @@ -8086,11 +8086,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { swap(dst_row[col], dst_row[ixj]); } } @@ -10825,7 +10825,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const sycl::range<3> block_dims(1, 1, ncols); const sycl::range<3> block_nums(1, nrows, 1); - if (order == GGML_SORT_ASC) { + if (order == GGML_SORT_ORDER_ASC) { /* DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10834,9 +10834,9 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); - } else if (order == GGML_SORT_DESC) { + } else if (order == GGML_SORT_ORDER_DESC) { /* DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query @@ -10845,7 +10845,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); + k_argsort_f32_i32(x, dst, ncols, item_ct1); }); } else { GGML_ASSERT(false); @@ -11407,12 +11407,12 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, dpct::memcpy_direction kind; char * src_ptr; - if (src->backend == GGML_BACKEND_CPU) { + if (src->backend == GGML_BACKEND_TYPE_CPU) { kind = dpct::host_to_device; src_ptr = (char *) src->data; - // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_CPU src_ptr %p\n", src_ptr); - } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); + } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { + GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = dpct::device_to_device; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -11846,7 +11846,7 @@ inline void ggml_sycl_op_mul_mat_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -12119,7 +12119,7 @@ inline void ggml_sycl_op_mul_mat_sycl( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check @@ -12501,16 +12501,16 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, const bool use_src1 = src1 != nullptr; const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; // dd = data device float * src0_ddf = nullptr; @@ -12565,7 +12565,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst)))); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); } @@ -12640,8 +12640,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -12656,13 +12656,13 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); @@ -12717,8 +12717,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; @@ -12782,8 +12782,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; const int64_t row_diff = row_high[id] - row_low[id]; ggml_sycl_set_device(get_device_id_by_index(id)); @@ -12809,12 +12809,12 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) { + if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) { dst_dd_i += row_low[id]; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { + if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { if (id != g_main_device_index) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; @@ -12830,14 +12830,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, src1_ncols * ne10 * sizeof(float)))); } } - } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { SYCL_CHECK(ggml_sycl_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); /* DPCT1010:92: SYCL uses exceptions to report errors and does @@ -12867,10 +12867,10 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, if (!dst_on_device) { void * dst_off_device; dpct::memcpy_direction kind; - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { dst_off_device = dst->data; kind = dpct::device_to_host; - } else if (dst->backend == GGML_BACKEND_GPU) { + } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { dst_off_device = dst_extra->data_device[g_main_device_index]; kind = dpct::device_to_device; } else { @@ -12954,7 +12954,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(ggml_sycl_set_device(g_main_device)); SYCL_CHECK(CHECK_TRY_ERROR( dpct::get_current_device().queues_wait_and_throw())); @@ -13091,7 +13091,7 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -13129,7 +13129,7 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13196,7 +13196,7 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13372,11 +13372,11 @@ catch (sycl::exception const &exc) { static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = - (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_GPU) && - ( dst->backend == GGML_BACKEND_GPU); + (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && + (src1->backend == GGML_BACKEND_TYPE_GPU) && + ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { @@ -13505,7 +13505,7 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src00)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne); @@ -13643,7 +13643,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (ids->backend == GGML_BACKEND_GPU) { + if (ids->backend == GGML_BACKEND_TYPE_GPU) { const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); @@ -13661,20 +13661,20 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_GPU; - dst_row.backend = GGML_BACKEND_GPU; + src1_row.backend = GGML_BACKEND_TYPE_GPU; + dst_row.backend = GGML_BACKEND_TYPE_GPU; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_CPU ? + char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; - char * dst_original = dst->backend == GGML_BACKEND_CPU ? + char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { //int32_t row_id; @@ -13756,7 +13756,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, } } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); } } @@ -13779,8 +13779,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -13887,17 +13887,17 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device_index) { + if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) { continue; } ggml_sycl_set_device(get_device_id_by_index(id)); const dpct::queue_ptr stream = g_syclStreams[id][0]; int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { + if (backend == GGML_BACKEND_TYPE_GPU) { row_low = 0; row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { + } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { const int64_t rounding = get_row_rounding(tensor->type); row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; @@ -13946,7 +13946,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { extra->data_device[id] = buf; - if (backend == GGML_BACKEND_GPU_SPLIT) { + if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { for (int64_t is = 0; is < MAX_STREAMS; ++is) { SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = new sycl::event())); @@ -13963,7 +13963,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_free_data(struct ggml_tensor *tensor) try { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { + if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) { return; } @@ -14016,15 +14016,15 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, return; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU) { const ggml_op src0_op = tensor->src[0]->op; if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); } } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { + if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU) { ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); } @@ -14042,7 +14042,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, SYCL_CHECK(ggml_sycl_set_device(g_main_device)); const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t offset = 0; @@ -14111,7 +14111,7 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor, const bool inplace = tensor->view_src != nullptr; - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { + if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; size_t view_offset = 0; @@ -14132,7 +14132,7 @@ catch (sycl::exception const &exc) { } void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -14219,9 +14219,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ if (!g_sycl_loaded) return false; ggml_sycl_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { return false; @@ -14359,14 +14359,14 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) { + if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { ggml_sycl_set_peer_access(tensor->src[1]->ne[1]); } if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } func(tensor->src[0], tensor->src[1], tensor); @@ -14517,7 +14517,7 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { @@ -14548,7 +14548,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14573,7 +14573,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; @@ -14809,7 +14809,7 @@ static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( (char *)tensor->data + offset, data, size))); @@ -14827,7 +14827,7 @@ static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( data, (const char *)tensor->data + offset, size))); @@ -14880,7 +14880,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph ggml_sycl_set_main_device(sycl_ctx->device); ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -14888,13 +14888,13 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) continue; - assert(node->backend == GGML_BACKEND_GPU); + assert(node->backend == GGML_BACKEND_TYPE_GPU); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->extra != nullptr); } diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 4e5eaff15110b..6caafb82279ae 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -2320,8 +2320,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2453,7 +2453,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su // compute ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data); ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, sizeof(float) * d_ne * ne12 * ne13); @@ -2506,8 +2506,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); @@ -2630,7 +2630,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); ggml_vk_sync_buffers(subctx); @@ -2647,7 +2647,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; #endif GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // NOLINT GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // NOLINT GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -2679,7 +2679,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t y_ne = ne10 * ne11 * ne12; @@ -2721,7 +2721,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2738,7 +2738,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -2771,7 +2771,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; const uint64_t d_ne = ne01 * ne11 * ne12; @@ -2814,7 +2814,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_sync_buffers(subctx); @@ -2832,7 +2832,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16 || ggml_is_quantized(src1->type)) && dst->type == GGML_TYPE_F32 && - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU); + ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU); } static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -2880,8 +2880,8 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx // TODO: support for transposed / permuted tensors GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; @@ -3110,8 +3110,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } } - const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; - const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; + const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; + const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; @@ -3120,7 +3120,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c vk_buffer d_D = extra->buffer_gpu.lock(); // Workaround for tiny tensor inputs on ROPE - if (use_src1 && src1->backend == GGML_BACKEND_GPU && y_sz > d_D->size) { + if (use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU && y_sz > d_D->size) { y_sz = VK_WHOLE_SIZE; } @@ -3209,9 +3209,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { + if (dst->backend == GGML_BACKEND_TYPE_CPU && op == GGML_OP_CPY) { ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); - } else if(dst->backend == GGML_BACKEND_CPU) { + } else if(dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host float * d = (float *) dst->data; ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); @@ -3253,7 +3253,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host ggml_vk_buffer_read_async(ctx, subctx, d_D, d_buf_offset + d_offset, (char *) dst->data + i02*nb2 + i03*nb3, d_sz); } @@ -3359,7 +3359,7 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { // If backend is CPU, data from src0 has to be copied off the device - if (dst->backend == GGML_BACKEND_CPU) { + if (dst->backend == GGML_BACKEND_TYPE_CPU) { ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; vk_buffer d_D = extra_src0->buffer_gpu.lock(); ggml_vk_sync_buffers(subctx); @@ -3994,9 +3994,9 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_GPU)); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU)); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT)) { return; @@ -4215,9 +4215,9 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ - const bool any_on_device = node->backend == GGML_BACKEND_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_GPU || node->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU + || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && node->op != GGML_OP_MUL_MAT) || (node->op == GGML_OP_MUL_MAT && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { return; @@ -4371,7 +4371,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod last_node = true; #endif - if (node->backend == GGML_BACKEND_CPU || last_node) { + if (node->backend == GGML_BACKEND_TYPE_CPU || last_node) { ggml_vk_ctx_end(ctx->compute_ctx); ctx->compute_ctx->exit_tensor = node; ctx->compute_ctx = nullptr; @@ -4379,9 +4379,9 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ - const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU + || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) + || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); if (ctx->disable || (!any_on_device && tensor->op != GGML_OP_MUL_MAT)) { return false; @@ -4442,7 +4442,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return true; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return true; } @@ -4745,7 +4745,7 @@ GGML_CALL static void ggml_backend_vk_buffer_init_tensor(ggml_backend_buffer_t b extra->offset = (uint8_t *) tensor->data - (uint8_t *) vk_ptr_base; } - tensor->backend = GGML_BACKEND_GPU; + tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; } @@ -4753,7 +4753,7 @@ GGML_CALL static void ggml_backend_vk_buffer_set_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_set_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4768,7 +4768,7 @@ GGML_CALL static void ggml_backend_vk_buffer_get_tensor(ggml_backend_buffer_t bu #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_backend_vk_buffer_get_tensor(" << buffer << ", " << tensor << ", " << data << ", " << offset << ", " << size << ")" << std::endl; #endif - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context; @@ -4999,7 +4999,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5020,7 +5020,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c #endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; GGML_ASSERT((tensor->buffer->buft == ggml_backend_vk_buffer_type(ctx->idx) || tensor->buffer->buft == ggml_backend_vk_host_buffer_type()) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; @@ -5097,7 +5097,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml int last_node = cgraph->n_nodes - 1; // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_GPU) { + while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) { last_node -= 1; } @@ -5106,7 +5106,7 @@ GGML_CALL static bool ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml } ggml_compute_params params = {}; - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -5410,7 +5410,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tensor * tensor, const char * name) { void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { const size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5436,14 +5436,14 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso std::vector done; ggml_vk_print_graph_origin(tensor, done); - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } static void ggml_vk_check_tensor(const std::string& name, const ggml_tensor * tensor) { return; - GGML_ASSERT(tensor->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_CPU); if (tensor->type != GGML_TYPE_F32 && tensor->type != GGML_TYPE_F16) { return; } @@ -5481,7 +5481,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -5518,10 +5518,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src0_buffer = malloc(src0_size); src0_clone->data = src0_buffer; - if (src0->backend == GGML_BACKEND_CPU) { + if (src0->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src0_clone->data, src0->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src0->backend == GGML_BACKEND_GPU) { + } else if (src0->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src0->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src0) && ggml_vk_dim01_contiguous(src0)) { @@ -5561,10 +5561,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src1_buffer = malloc(src1_size); src1_clone->data = src1_buffer; - if (src1->backend == GGML_BACKEND_CPU) { + if (src1->backend == GGML_BACKEND_TYPE_CPU) { memcpy(src1_clone->data, src1->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); - } else if (src1->backend == GGML_BACKEND_GPU) { + } else if (src1->backend == GGML_BACKEND_TYPE_GPU) { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src1->extra; uint64_t offset = extra->offset; if (!ggml_is_contiguous(src1) && ggml_vk_dim01_contiguous(src1)) { @@ -5723,7 +5723,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ if (params->ith != 0) { return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { @@ -5735,7 +5735,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ void * tensor_data = tensor->data; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { size_t tensor_size = ggml_nbytes(tensor); tensor_data = malloc(tensor_size); @@ -5868,7 +5868,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ comp_result = nullptr; comp_size = 0; - if (tensor->backend == GGML_BACKEND_GPU) { + if (tensor->backend == GGML_BACKEND_TYPE_GPU) { free(tensor_data); } } diff --git a/ggml.c b/ggml.c index c09a3cad657f2..1d81553f47106 100644 --- a/ggml.c +++ b/ggml.c @@ -2721,7 +2721,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( } } - struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); + struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size); // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here @@ -2729,7 +2729,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, - /*.backend =*/ GGML_BACKEND_CPU, + /*.backend =*/ GGML_BACKEND_TYPE_CPU, /*.buffer =*/ NULL, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -3302,7 +3302,7 @@ struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx) { char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3319,7 +3319,7 @@ struct ggml_tensor * ggml_get_next_tensor(const struct ggml_context * ctx, struc char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { return (struct ggml_tensor *)(mem_buffer + obj->offs); } @@ -3335,7 +3335,7 @@ struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * nam char * const mem_buffer = ctx->mem_buffer; while (obj != NULL) { - if (obj->type == GGML_OBJECT_TENSOR) { + if (obj->type == GGML_OBJECT_TYPE_TENSOR) { struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); if (strcmp(cur->name, name) == 0) { return cur; @@ -5879,7 +5879,7 @@ struct ggml_tensor * ggml_top_k( int k) { GGML_ASSERT(a->ne[0] >= k); - struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC); + struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_ORDER_DESC); result = ggml_view_4d(ctx, result, k, result->ne[1], result->ne[2], result->ne[3], @@ -6673,7 +6673,7 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6705,7 +6705,7 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -6978,7 +6978,7 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7231,7 +7231,7 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7411,7 +7411,7 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7419,7 +7419,7 @@ static void ggml_compute_forward_add_f32( const int nth = params->nth; #ifdef GGML_USE_CLBLAST - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -7501,7 +7501,7 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7580,7 +7580,7 @@ static void ggml_compute_forward_add_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7636,7 +7636,7 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7774,7 +7774,7 @@ static void ggml_compute_forward_add1_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7828,7 +7828,7 @@ static void ggml_compute_forward_add1_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7880,7 +7880,7 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -7932,7 +7932,7 @@ static void ggml_compute_forward_add1_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8062,7 +8062,7 @@ static void ggml_compute_forward_acc_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -8074,7 +8074,7 @@ static void ggml_compute_forward_acc_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8176,7 +8176,7 @@ static void ggml_compute_forward_sub_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8257,14 +8257,14 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } const int ith = params->ith; const int nth = params->nth; #if defined(GGML_USE_CLBLAST) - if (src1->backend == GGML_BACKEND_GPU) { + if (src1->backend == GGML_BACKEND_TYPE_GPU) { // TODO: OpenCL kernel support full broadcast GGML_ASSERT(ggml_can_repeat_rows(src1, src0)); if (ith == 0) { @@ -8365,7 +8365,7 @@ static void ggml_compute_forward_div_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8460,7 +8460,7 @@ static void ggml_compute_forward_sqr_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8506,7 +8506,7 @@ static void ggml_compute_forward_sqrt_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8552,7 +8552,7 @@ static void ggml_compute_forward_log_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8598,7 +8598,7 @@ static void ggml_compute_forward_sum_f32( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8633,7 +8633,7 @@ static void ggml_compute_forward_sum_f16( assert(params->ith == 0); assert(ggml_is_scalar(dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8690,7 +8690,7 @@ static void ggml_compute_forward_sum_rows_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8745,7 +8745,7 @@ static void ggml_compute_forward_mean_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8804,7 +8804,7 @@ static void ggml_compute_forward_argmax_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8855,7 +8855,7 @@ static void ggml_compute_forward_repeat_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8900,7 +8900,7 @@ static void ggml_compute_forward_repeat_f16( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -8974,7 +8974,7 @@ static void ggml_compute_forward_repeat_back_f32( GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_can_repeat(dst, src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9051,7 +9051,7 @@ static void ggml_compute_forward_concat_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9123,7 +9123,7 @@ static void ggml_compute_forward_abs_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9169,7 +9169,7 @@ static void ggml_compute_forward_sgn_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9215,7 +9215,7 @@ static void ggml_compute_forward_neg_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9261,7 +9261,7 @@ static void ggml_compute_forward_step_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9307,7 +9307,7 @@ static void ggml_compute_forward_tanh_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9353,7 +9353,7 @@ static void ggml_compute_forward_elu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9399,7 +9399,7 @@ static void ggml_compute_forward_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9446,7 +9446,7 @@ static void ggml_compute_forward_gelu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9509,7 +9509,7 @@ static void ggml_compute_forward_gelu_quick_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9572,7 +9572,7 @@ static void ggml_compute_forward_silu_f32( GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9633,7 +9633,7 @@ static void ggml_compute_forward_leaky_relu_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9686,7 +9686,7 @@ static void ggml_compute_forward_silu_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src0, grad)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9748,7 +9748,7 @@ static void ggml_compute_forward_hardswish_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9791,7 +9791,7 @@ static void ggml_compute_forward_hardsigmoid_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9837,7 +9837,7 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9912,7 +9912,7 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -9983,7 +9983,7 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10161,7 +10161,7 @@ static void ggml_compute_forward_group_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10328,7 +10328,7 @@ static void ggml_compute_forward_mul_mat( #if defined(GGML_USE_CLBLAST) if (ggml_cl_can_mul_mat(src0, src1, dst)) { - if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + if (params->ith == 0 && params->type == GGML_TASK_TYPE_COMPUTE) { ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize); } return; @@ -10341,7 +10341,7 @@ static void ggml_compute_forward_mul_mat( const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float); UNUSED(desired_wsize); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (type != GGML_TYPE_F32) { assert(params->wsize >= desired_wsize); // parallelize by src0 rows @@ -10364,7 +10364,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10402,7 +10402,7 @@ static void ggml_compute_forward_mul_mat( } #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10426,7 +10426,7 @@ static void ggml_compute_forward_mul_mat( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10583,7 +10583,7 @@ static void ggml_compute_forward_mul_mat_id( #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10620,7 +10620,7 @@ static void ggml_compute_forward_mul_mat_id( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10768,7 +10768,7 @@ static void ggml_compute_forward_out_prod_f32( (ggml_is_contiguous(src1) || ggml_is_transposed(src1)); #endif - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst if (use_blas) { return; @@ -10781,7 +10781,7 @@ static void ggml_compute_forward_out_prod_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10961,7 +10961,7 @@ static void ggml_compute_forward_out_prod_q_f32( // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -10969,7 +10969,7 @@ static void ggml_compute_forward_out_prod_q_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11087,7 +11087,7 @@ static void ggml_compute_forward_scale_f32( GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11159,7 +11159,7 @@ static void ggml_compute_forward_set_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (params->ith != 0) { return; } @@ -11171,7 +11171,7 @@ static void ggml_compute_forward_set_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11319,7 +11319,7 @@ static void ggml_compute_forward_get_rows_q( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11359,7 +11359,7 @@ static void ggml_compute_forward_get_rows_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11396,7 +11396,7 @@ static void ggml_compute_forward_get_rows_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11499,14 +11499,14 @@ static void ggml_compute_forward_get_rows_back_f32_f16( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11538,14 +11538,14 @@ static void ggml_compute_forward_get_rows_back_f32( // ggml_compute_forward_dup_same_cont(params, opt0, dst); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memset(dst->data, 0, ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11615,7 +11615,7 @@ static void ggml_compute_forward_diag_f32( GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11684,7 +11684,7 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(n_past >= 0); - if (!inplace && (params->type == GGML_TASK_INIT)) { + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { if (ith != 0) { return; } @@ -11698,7 +11698,7 @@ static void ggml_compute_forward_diag_mask_f32( ggml_nbytes(dst)); } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11772,7 +11772,7 @@ static void ggml_compute_forward_soft_max_f32( assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -11910,7 +11910,7 @@ static void ggml_compute_forward_soft_max_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12004,7 +12004,7 @@ static void ggml_compute_forward_alibi_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12063,7 +12063,7 @@ static void ggml_compute_forward_alibi_f16( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12170,7 +12170,7 @@ static void ggml_compute_forward_clamp_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12310,7 +12310,7 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12488,7 +12488,7 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12719,7 +12719,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12759,7 +12759,7 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12818,7 +12818,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -12858,7 +12858,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -12962,11 +12962,11 @@ static void ggml_compute_forward_im2col_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13050,11 +13050,11 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13136,7 +13136,7 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; } @@ -13178,7 +13178,7 @@ static void ggml_compute_forward_conv_transpose_2d( return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13230,7 +13230,7 @@ static void ggml_compute_forward_pool_1d_sk_p0( assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13299,7 +13299,7 @@ static void ggml_compute_forward_pool_2d( GGML_ASSERT(src->type == GGML_TYPE_F32); GGML_ASSERT(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13372,7 +13372,7 @@ static void ggml_compute_forward_upscale_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13432,7 +13432,7 @@ static void ggml_compute_forward_pad_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13493,7 +13493,7 @@ static void ggml_compute_forward_argsort_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13519,8 +13519,8 @@ static void ggml_compute_forward_argsort_f32( // C doesn't have a functional sort, so we do a bubble sort instead for (int64_t j = 0; j < ne0; j++) { for (int64_t k = j + 1; k < ne0; k++) { - if ((order == GGML_SORT_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || - (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { + if ((order == GGML_SORT_ORDER_ASC && src_data[dst_data[j]] > src_data[dst_data[k]]) || + (order == GGML_SORT_ORDER_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) { int32_t tmp = dst_data[j]; dst_data[j] = dst_data[k]; dst_data[k] = tmp; @@ -13603,11 +13603,11 @@ static void ggml_compute_forward_flash_attn_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13795,11 +13795,11 @@ static void ggml_compute_forward_flash_attn_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14054,11 +14054,11 @@ static void ggml_compute_forward_flash_ff_f16( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14213,14 +14213,14 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14536,7 +14536,7 @@ static void ggml_compute_forward_win_part_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14602,7 +14602,7 @@ static void ggml_compute_forward_win_unpart_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14730,7 +14730,7 @@ static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14782,14 +14782,14 @@ static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_tensor * src2 = dst->src[2]; const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; - if (!inplace && params->type == GGML_TASK_INIT) { + if (!inplace && params->type == GGML_TASK_TYPE_INIT) { if (params->ith != 0) { return; } memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); return; } - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14871,7 +14871,7 @@ static void ggml_compute_forward_map_unary_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14920,7 +14920,7 @@ static void ggml_compute_forward_map_binary_f32( assert(params->ith == 0); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14969,7 +14969,7 @@ static void ggml_compute_forward_map_custom1_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14988,7 +14988,7 @@ static void ggml_compute_forward_map_custom2_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15008,7 +15008,7 @@ static void ggml_compute_forward_map_custom3_f32( assert(params->ith == 0); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15023,7 +15023,7 @@ static void ggml_compute_forward_map_custom1( const struct ggml_tensor * a = dst->src[0]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15041,7 +15041,7 @@ static void ggml_compute_forward_map_custom2( const struct ggml_tensor * a = dst->src[0]; const struct ggml_tensor * b = dst->src[1]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15060,7 +15060,7 @@ static void ggml_compute_forward_map_custom3( const struct ggml_tensor * b = dst->src[1]; const struct ggml_tensor * c = dst->src[2]; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15094,14 +15094,14 @@ static void ggml_compute_forward_cross_entropy_loss_f32( GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (params->type == GGML_TASK_INIT) { + if (params->type == GGML_TASK_TYPE_INIT) { if (ith == 0) { memset(sums, 0, sizeof(float) * (nth + nth * nc)); } return; } - if (params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_FINALIZE) { if (ith == 0) { float * dp = (float *) dst->data; ggml_vec_sum_f32(nth, dp, sums); @@ -15216,7 +15216,7 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -15323,8 +15323,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #elif defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS @@ -15335,8 +15335,8 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); + GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); #endif // GGML_USE_CUBLAS #ifdef GGML_USE_SYCL @@ -16882,7 +16882,7 @@ size_t ggml_graph_overhead(void) { struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads) { const size_t obj_size = ggml_graph_nbytes(size, grads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_GRAPH, obj_size); struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs); struct ggml_tensor ** data_start = (struct ggml_tensor **) (cgraph + 1); @@ -17429,7 +17429,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { set_numa_thread_affinity(state->ith); int node_n = -1; - int task_phase = GGML_TASK_FINALIZE; + int task_phase = GGML_TASK_TYPE_FINALIZE; while (true) { if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { @@ -17441,7 +17441,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { // all other threads are finished and spinning // do finalize and init here so we don't have synchronize again struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_FINALIZE, + /*.type =*/ GGML_TASK_TYPE_FINALIZE, /*.ith =*/ 0, /*.nth =*/ 0, /*.wsize =*/ cplan->work_size, @@ -17472,17 +17472,17 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (n_tasks == 1) { /* INIT */ if (GGML_OP_HAS_INIT[node->op]) { - params.type = GGML_TASK_INIT; + params.type = GGML_TASK_TYPE_INIT; ggml_compute_forward(¶ms, node); } // TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1, // they do something more efficient than spinning (?) - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); if (GGML_OP_HAS_FINALIZE[node->op]) { - params.type = GGML_TASK_FINALIZE; + params.type = GGML_TASK_TYPE_FINALIZE; ggml_compute_forward(¶ms, node); } @@ -17496,7 +17496,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } } - task_phase = GGML_TASK_INIT; + task_phase = GGML_TASK_TYPE_INIT; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_n, node_n); atomic_store(&state->shared->node_task, task_phase); @@ -17513,7 +17513,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { const int n_tasks = ggml_get_n_tasks(node, n_threads); struct ggml_compute_params params = { - /*.type =*/ GGML_TASK_INIT, + /*.type =*/ GGML_TASK_TYPE_INIT, /*.ith =*/ state->ith, /*.nth =*/ n_tasks, /*.wsize =*/ cplan->work_size, @@ -17527,7 +17527,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_COMPUTE; + task_phase = GGML_TASK_TYPE_COMPUTE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17542,12 +17542,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { } if (state->ith < n_tasks) { - params.type = GGML_TASK_COMPUTE; + params.type = GGML_TASK_TYPE_COMPUTE; ggml_compute_forward(¶ms, node); } if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) { - task_phase = GGML_TASK_FINALIZE; + task_phase = GGML_TASK_TYPE_FINALIZE; atomic_store(&state->shared->n_active, n_threads); atomic_store(&state->shared->node_task, task_phase); } @@ -17783,7 +17783,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { /*.n_threads =*/ n_threads, /*.n_active =*/ n_threads, /*.node_n =*/ -1, - /*.node_task =*/ GGML_TASK_FINALIZE, + /*.node_task =*/ GGML_TASK_TYPE_FINALIZE, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, }; @@ -17851,7 +17851,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; @@ -18659,7 +18659,7 @@ static enum ggml_opt_result ggml_opt_adam( float * pf = params.past > 0 ? opt->adam.pf->data : NULL; // past function values struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; bool cancel = false; @@ -18671,7 +18671,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18762,7 +18762,7 @@ static enum ggml_opt_result ggml_opt_adam( if (callback) { callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL;; + return GGML_OPT_RESULT_CANCEL;; } } // ggml_graph_reset (gf); @@ -18779,7 +18779,7 @@ static enum ggml_opt_result ggml_opt_adam( if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -18789,7 +18789,7 @@ static enum ggml_opt_result ggml_opt_adam( const float rate = (pf[(iter0 + t)%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -18805,7 +18805,7 @@ static enum ggml_opt_result ggml_opt_adam( ++n_no_improvement[0]; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } @@ -18823,7 +18823,7 @@ static enum ggml_opt_result ggml_opt_adam( } } - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // @@ -18904,7 +18904,7 @@ static enum ggml_opt_result linesearch_backtracking( float sched = 0; callback(callback_data, accum_step, &sched, cancel); if (*cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -18977,7 +18977,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { - return GGML_OPT_INVALID_WOLFE; + return GGML_OPT_RESULT_INVALID_WOLFE; } } @@ -19006,7 +19006,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } struct ggml_cplan cplan = ggml_graph_plan(gb, params.n_threads); - struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size); + struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_TYPE_WORK_BUFFER, cplan.work_size); cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs; float * x = opt->lbfgs.x->data; // current parameters @@ -19047,7 +19047,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( float sched = 0; callback(callback_data, accum_step, &sched, &cancel); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } } // ggml_graph_reset (gf); @@ -19075,7 +19075,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // already optimized if (gnorm/xnorm <= params.lbfgs.eps) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } if (opt->just_initialized) { @@ -19120,7 +19120,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( // way to test and don't want to break something with so many changes lined up ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); if (cancel) { - return GGML_OPT_CANCEL; + return GGML_OPT_RESULT_CANCEL; } if (ls < 0) { @@ -19143,7 +19143,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( } if (gnorm/xnorm <= params.lbfgs.eps) { // converged - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } // delta-based convergence test @@ -19153,7 +19153,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( const float rate = (pf[k[0]%params.past] - fx)/fx; if (fabsf(rate) < params.delta) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } @@ -19169,14 +19169,14 @@ static enum ggml_opt_result ggml_opt_lbfgs( n_no_improvement[0]++; if (n_no_improvement[0] >= params.max_no_improvement) { - return GGML_OPT_OK; + return GGML_OPT_RESULT_OK; } } } if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < it + 1) { // reached the maximum number of iterations - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } // update vectors s and y: @@ -19232,17 +19232,17 @@ static enum ggml_opt_result ggml_opt_lbfgs( GGML_ASSERT(false && "lbfgs failed"); - return GGML_OPT_DID_NOT_CONVERGE; + return GGML_OPT_RESULT_DID_NOT_CONVERGE; } struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { struct ggml_opt_params result; switch (type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = (struct ggml_opt_params) { - .type = GGML_OPT_ADAM, + .type = GGML_OPT_TYPE_ADAM, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, // FIXME: GGML_DEFAULT_N_THREADS ? .past = 0, @@ -19270,10 +19270,10 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) { }, }; } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = (struct ggml_opt_params) { - .type = GGML_OPT_LBFGS, + .type = GGML_OPT_TYPE_LBFGS, .graph_size = GGML_DEFAULT_GRAPH_SIZE, .n_threads = 1, .past = 0, @@ -19318,12 +19318,12 @@ GGML_API void ggml_opt_init( opt->just_initialized = true; if (opt->ctx == NULL) { struct ggml_init_params ctx_opt_params; - if (opt->params.type == GGML_OPT_ADAM) { + if (opt->params.type == GGML_OPT_TYPE_ADAM) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*3 + ggml_tensor_overhead()*3 + ggml_type_size(GGML_TYPE_F32)*nx*3; if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; } - } else if (opt->params.type == GGML_OPT_LBFGS) { + } else if (opt->params.type == GGML_OPT_TYPE_LBFGS) { ctx_opt_params.mem_size = GGML_MEM_ALIGN*9 + ggml_tensor_overhead()*9 + ggml_type_size(GGML_TYPE_F32)*(nx*5 + opt->params.lbfgs.m*2 + nx*opt->params.lbfgs.m*2); if (opt->params.past > 0) { ctx_opt_params.mem_size += GGML_MEM_ALIGN + ggml_tensor_overhead() + ggml_type_size(GGML_TYPE_F32)*opt->params.past; @@ -19335,7 +19335,7 @@ GGML_API void ggml_opt_init( opt->ctx = ggml_init(ctx_opt_params); } switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { opt->adam.g = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->adam.m = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19349,7 +19349,7 @@ GGML_API void ggml_opt_init( ggml_set_zero(opt->adam.pf); } } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { opt->lbfgs.x = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); opt->lbfgs.xp = ggml_new_tensor_1d(opt->ctx, GGML_TYPE_F32, nx); @@ -19393,13 +19393,13 @@ enum ggml_opt_result ggml_opt( ctx = ggml_init(params_ctx); if (ctx == NULL) { - return GGML_OPT_NO_CONTEXT; + return GGML_OPT_RESULT_NO_CONTEXT; } free_ctx = true; } - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; struct ggml_opt_context * opt = (struct ggml_opt_context *) alloca(sizeof(struct ggml_opt_context)); @@ -19438,14 +19438,14 @@ enum ggml_opt_result ggml_opt_resume_g( void * callback_data) { // build forward + backward compute graphs - enum ggml_opt_result result = GGML_OPT_OK; + enum ggml_opt_result result = GGML_OPT_RESULT_OK; switch (opt->params.type) { - case GGML_OPT_ADAM: + case GGML_OPT_TYPE_ADAM: { result = ggml_opt_adam(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; - case GGML_OPT_LBFGS: + case GGML_OPT_TYPE_LBFGS: { result = ggml_opt_lbfgs(ctx, opt, opt->params, f, gf, gb, callback, callback_data); } break; diff --git a/ggml.h b/ggml.h index a4166e1f7afd0..75fd035a4698f 100644 --- a/ggml.h +++ b/ggml.h @@ -364,9 +364,9 @@ extern "C" { }; enum ggml_backend_type { - GGML_BACKEND_CPU = 0, - GGML_BACKEND_GPU = 10, - GGML_BACKEND_GPU_SPLIT = 20, + GGML_BACKEND_TYPE_CPU = 0, + GGML_BACKEND_TYPE_GPU = 10, + GGML_BACKEND_TYPE_GPU_SPLIT = 20, }; // model file types @@ -498,9 +498,9 @@ extern "C" { }; enum ggml_object_type { - GGML_OBJECT_TENSOR, - GGML_OBJECT_GRAPH, - GGML_OBJECT_WORK_BUFFER + GGML_OBJECT_TYPE_TENSOR, + GGML_OBJECT_TYPE_GRAPH, + GGML_OBJECT_TYPE_WORK_BUFFER }; enum ggml_log_level { @@ -642,9 +642,9 @@ extern "C" { // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. enum ggml_task_type { - GGML_TASK_INIT = 0, - GGML_TASK_COMPUTE, - GGML_TASK_FINALIZE, + GGML_TASK_TYPE_INIT = 0, + GGML_TASK_TYPE_COMPUTE, + GGML_TASK_TYPE_FINALIZE, }; struct ggml_compute_params { @@ -1649,8 +1649,8 @@ extern "C" { // sort rows enum ggml_sort_order { - GGML_SORT_ASC, - GGML_SORT_DESC, + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, }; GGML_API struct ggml_tensor * ggml_argsort( @@ -1943,8 +1943,8 @@ extern "C" { // optimization methods enum ggml_opt_type { - GGML_OPT_ADAM, - GGML_OPT_LBFGS, + GGML_OPT_TYPE_ADAM, + GGML_OPT_TYPE_LBFGS, }; // linesearch methods @@ -1958,12 +1958,12 @@ extern "C" { // optimization return values enum ggml_opt_result { - GGML_OPT_OK = 0, - GGML_OPT_DID_NOT_CONVERGE, - GGML_OPT_NO_CONTEXT, - GGML_OPT_INVALID_WOLFE, - GGML_OPT_FAIL, - GGML_OPT_CANCEL, + GGML_OPT_RESULT_OK = 0, + GGML_OPT_RESULT_DID_NOT_CONVERGE, + GGML_OPT_RESULT_NO_CONTEXT, + GGML_OPT_RESULT_INVALID_WOLFE, + GGML_OPT_RESULT_FAIL, + GGML_OPT_RESULT_CANCEL, GGML_LINESEARCH_FAIL = -128, GGML_LINESEARCH_MINIMUM_STEP, diff --git a/llama.cpp b/llama.cpp index 1f6b6cff48987..acd9be08a6e5e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -850,9 +850,9 @@ struct LLM_TN { // static std::map LLAMA_ROPE_SCALING_TYPES = { - { LLAMA_ROPE_SCALING_NONE, "none" }, - { LLAMA_ROPE_SCALING_LINEAR, "linear" }, - { LLAMA_ROPE_SCALING_YARN, "yarn" }, + { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, + { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, + { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, }; static int32_t llama_rope_scaling_type_from_string(const std::string & name) { @@ -862,7 +862,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) { } } - return LLAMA_ROPE_SCALING_UNSPECIFIED; + return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; } static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) { @@ -1580,7 +1580,7 @@ struct llama_hparams { bool causal_attn = true; bool need_kq_pos = false; - uint32_t pooling_type = LLAMA_POOLING_NONE; + uint32_t pooling_type = LLAMA_POOLING_TYPE_NONE; bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; @@ -2345,9 +2345,9 @@ namespace GGUFMeta { static const char * override_type_to_str(const llama_model_kv_override_type ty) { switch (ty) { - case LLAMA_KV_OVERRIDE_BOOL: return "bool"; - case LLAMA_KV_OVERRIDE_INT: return "int"; - case LLAMA_KV_OVERRIDE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; + case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; } return "unknown"; } @@ -2358,13 +2358,13 @@ namespace GGUFMeta { LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ", __func__, override_type_to_str(override->tag), override->key); switch (override->tag) { - case LLAMA_KV_OVERRIDE_BOOL: { + case LLAMA_KV_OVERRIDE_TYPE_BOOL: { LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false"); } break; - case LLAMA_KV_OVERRIDE_INT: { + case LLAMA_KV_OVERRIDE_TYPE_INT: { LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value); } break; - case LLAMA_KV_OVERRIDE_FLOAT: { + case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { LLAMA_LOG_INFO("%.6f\n", override->float_value); } break; default: @@ -2383,7 +2383,7 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, override)) { target = override->bool_value; return true; } @@ -2393,7 +2393,7 @@ namespace GGUFMeta { template static typename std::enable_if::value && std::is_integral::value, bool>::type try_override(OT & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, override)) { target = override->int_value; return true; } @@ -2403,7 +2403,7 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override *override) { - if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) { + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, override)) { target = override->float_value; return true; } @@ -2999,7 +2999,7 @@ static void llm_load_hparams( std::string rope_scaling("linear"); ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false); hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling); - GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED); + GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED); // rope_freq_scale (inverse of the kv) is optional float ropescale = 0.0f; @@ -3643,7 +3643,7 @@ static bool llm_load_tensors( model.buft_layer[i] = llama_default_buffer_type_cpu(true); } - if (split_mode == LLAMA_SPLIT_LAYER) { + if (split_mode == LLAMA_SPLIT_MODE_LAYER) { // calculate the split points int device_count = llama_get_device_count(); bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); @@ -3682,10 +3682,10 @@ static bool llm_load_tensors( } } else { ggml_backend_buffer_type_t split_buft; - if (split_mode == LLAMA_SPLIT_ROW) { + if (split_mode == LLAMA_SPLIT_MODE_ROW) { split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); } else { - // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported split_buft = llama_default_buffer_type_offload(main_gpu); } // assign the repeating layers @@ -5070,7 +5070,7 @@ struct llm_build_context { kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), do_rope_shift (worst_case || kv_self.has_shift), - pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE), + pooling_type (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_TYPE_NONE), cb (cb), buf_compute_meta (lctx.buf_compute_meta) { // all initializations should be done in init() @@ -6050,12 +6050,12 @@ struct llm_build_context { cur = inpL; // pooling layer - if (pooling_type == LLAMA_POOLING_MEAN) { + if (pooling_type == LLAMA_POOLING_TYPE_MEAN) { cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean); - } else if (pooling_type == LLAMA_POOLING_CLS) { + } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) { cur = ggml_get_rows(ctx0, cur, inp_cls); } else { - GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type"); + GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type"); } cb(cur, "result_embd", -1); @@ -7754,7 +7754,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer)); @@ -7782,7 +7782,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) { + if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) { const int64_t n_tokens = batch.n_tokens; GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer)); @@ -11351,7 +11351,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, - /*.split_mode =*/ LLAMA_SPLIT_LAYER, + /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -11377,7 +11377,7 @@ struct llama_context_params llama_context_default_params() { /*.n_batch =*/ 512, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, - /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_UNSPECIFIED, + /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, /*.rope_freq_base =*/ 0.0f, /*.rope_freq_scale =*/ 0.0f, /*.yarn_ext_factor =*/ -1.0f, @@ -11565,16 +11565,16 @@ struct llama_context * llama_new_context_with_model( cparams.cb_eval_user_data = params.cb_eval_user_data; auto rope_scaling_type = params.rope_scaling_type; - if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) { rope_scaling_type = hparams.rope_scaling_type_train; } - if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) { + if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) { cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none } if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set' - cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f; + cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f; } if (params.seed == LLAMA_DEFAULT_SEED) { @@ -11608,8 +11608,8 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); if (backend == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); @@ -11618,7 +11618,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(backend); } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { diff --git a/llama.h b/llama.h index 889edf4d97b96..947284ea2f535 100644 --- a/llama.h +++ b/llama.h @@ -109,23 +109,23 @@ extern "C" { }; enum llama_rope_scaling_type { - LLAMA_ROPE_SCALING_UNSPECIFIED = -1, - LLAMA_ROPE_SCALING_NONE = 0, - LLAMA_ROPE_SCALING_LINEAR = 1, - LLAMA_ROPE_SCALING_YARN = 2, - LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, + LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1, + LLAMA_ROPE_SCALING_TYPE_NONE = 0, + LLAMA_ROPE_SCALING_TYPE_LINEAR = 1, + LLAMA_ROPE_SCALING_TYPE_YARN = 2, + LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN, }; enum llama_pooling_type { - LLAMA_POOLING_NONE = 0, - LLAMA_POOLING_MEAN = 1, - LLAMA_POOLING_CLS = 2, + LLAMA_POOLING_TYPE_NONE = 0, + LLAMA_POOLING_TYPE_MEAN = 1, + LLAMA_POOLING_TYPE_CLS = 2, }; enum llama_split_mode { - LLAMA_SPLIT_NONE = 0, // single GPU - LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs - LLAMA_SPLIT_ROW = 2, // split rows across GPUs + LLAMA_SPLIT_MODE_NONE = 0, // single GPU + LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs }; typedef struct llama_token_data { @@ -173,9 +173,9 @@ extern "C" { } llama_batch; enum llama_model_kv_override_type { - LLAMA_KV_OVERRIDE_INT, - LLAMA_KV_OVERRIDE_FLOAT, - LLAMA_KV_OVERRIDE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_INT, + LLAMA_KV_OVERRIDE_TYPE_FLOAT, + LLAMA_KV_OVERRIDE_TYPE_BOOL, }; struct llama_model_kv_override { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f8574588bcf2f..24d12ef141efd 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -1264,7 +1264,7 @@ struct test_argsort : public test_case { test_argsort(ggml_type type = GGML_TYPE_F32, std::array ne = {16, 10, 10, 10}, - ggml_sort_order order = GGML_SORT_ASC) + ggml_sort_order order = GGML_SORT_ORDER_ASC) : type(type), ne(ne), order(order) {} ggml_tensor * build_graph(ggml_context * ctx) override { @@ -2116,7 +2116,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_concat(GGML_TYPE_F32)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32)); - for (ggml_sort_order order : {GGML_SORT_ASC, GGML_SORT_DESC}) { + for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); } diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp index 2c9997fca7705..546ca230ba417 100644 --- a/tests/test-opt.cpp +++ b/tests/test-opt.cpp @@ -118,7 +118,7 @@ int main(void) { const float fe = ggml_get_f32_1d(e, 0); printf("%s: e = %.4f\n", __func__, fe); - struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_ADAM); + struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM); ggml_opt(ctx, opt_params, e); From 12894088170f62e4cad4f8d6a3043c185b414bab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sun, 25 Feb 2024 11:53:11 +0100 Subject: [PATCH 06/18] cmake : fix compilation for Android armeabi-v7a (#5702) --- CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c4629001a38d..48880f7204bf5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -936,10 +936,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") + # Android armeabi-v7a + list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) + else() + # Raspberry Pi 2 + list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + endif() endif() if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") + # Android arm64-v8a # Raspberry Pi 3, 4, Zero 2 (32-bit) list(APPEND ARCH_FLAGS -mno-unaligned-access) endif() From a6aff3fba061985f2e7edec5ac828add7ed69444 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 25 Feb 2024 19:40:40 +0800 Subject: [PATCH 07/18] fix typo --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index debe4f62358b3..a0630c9f16d08 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -2682,7 +2682,7 @@ def start_in_seperate_process(launch_args): parser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0) parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true') parser.add_argument("--usemlock", help="For Apple Systems. Force system to keep model in RAM rather than swapping or compressing", action='store_true') - parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true') + parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices.", action='store_true') parser.add_argument("--debugmode", help="Shows additional debug info in the terminal.", nargs='?', const=1, type=int, default=0) parser.add_argument("--skiplauncher", help="Doesn't display or use the GUI launcher.", action='store_true') parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength, max ctxlen, API key and worker name.",metavar=('[hordemodelname]', '[hordegenlength] [hordemaxctx] [hordeapikey] [hordeworkername]'), nargs='+') From d52d7819b8ced70c642a88a59da8c78208dc58ec Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 13:49:43 +0100 Subject: [PATCH 08/18] server: concurrency fix + monitoring - add /metrics prometheus compatible endpoint (#5708) * server: monitoring - add /metrics prometheus compatible endpoint * server: concurrency issue, when 2 task are waiting for results, only one call thread is notified * server: metrics - move to a dedicated struct --- examples/server/README.md | 13 ++ examples/server/server.cpp | 150 +++++++++++++++++- examples/server/tests/features/environment.py | 2 + examples/server/tests/features/server.feature | 2 + examples/server/tests/features/steps/steps.py | 27 ++++ examples/server/tests/requirements.txt | 1 + examples/server/utils.hpp | 4 +- 7 files changed, 191 insertions(+), 8 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 0c43ac4c97cba..2129f7fb2b463 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -41,6 +41,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` - `-n, --n-predict`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. +- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) ## Build @@ -457,6 +458,18 @@ Notice that each `probs` is an array of length `n_probs`. ] ``` +- **GET** `/metrics`: [Prometheus](https://prometheus.io/) compatible metrics exporter endpoint if `--metrics` is enabled: + +Available metrics: +- `llamacpp:prompt_tokens_total`: Number of prompt tokens processed. +- `llamacpp:tokens_predicted_total`: Number of generation tokens processed. +- `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s. +- `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s. +- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage. +- `llamacpp:kv_cache_tokens`: KV-cache tokens. +- `llamacpp:requests_processing`: Number of request processing. +- `llamacpp:requests_deferred`: Number of request deferred. + ## More examples ### Change system prompt on runtime diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 780862ef67810..811495915f6dd 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,6 +43,7 @@ struct server_params int32_t read_timeout = 600; int32_t write_timeout = 600; bool slots_endpoint = true; + bool metrics_endpoint = false; }; bool server_verbose = false; @@ -310,6 +311,39 @@ struct llama_client_slot } }; +struct llama_metrics { + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t n_tokens_predicted_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + + void on_prompt_eval(const llama_client_slot &slot) { + n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; + + n_prompt_tokens_processed += slot.num_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; + } + + void on_prediction(const llama_client_slot &slot) { + n_tokens_predicted_total += slot.n_decoded; + + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; + } + + void reset_bucket() { + n_prompt_tokens_processed = 0; + t_prompt_processing = 0; + n_tokens_predicted = 0; + t_tokens_generation = 0; + } +}; + struct llama_server_context { llama_model *model = nullptr; @@ -344,6 +378,8 @@ struct llama_server_context llama_server_queue queue_tasks; llama_server_response queue_results; + llama_metrics metrics; + ~llama_server_context() { if (ctx) @@ -1404,7 +1440,7 @@ struct llama_server_context case TASK_TYPE_NEXT_RESPONSE: { // do nothing } break; - case TASK_TYPE_SLOTS_DATA: { + case TASK_TYPE_METRICS: { json slots_data = json::array(); int n_idle_slots = 0; int n_processing_slots = 0; @@ -1438,10 +1474,24 @@ struct llama_server_context res.stop = true; res.error = false; res.result_json = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "slots", slots_data } + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", queue_tasks.queue_tasks_deferred.size() }, + + { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, + { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, + + { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, + { "t_prompt_processing", metrics.t_prompt_processing}, + { "n_tokens_predicted", metrics.n_tokens_predicted}, + { "t_tokens_generation", metrics.t_tokens_generation}, + + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + + { "slots", slots_data }, }; + metrics.reset_bucket(); queue_results.send(res); } break; } @@ -1849,6 +1899,7 @@ struct llama_server_context { slot.t_start_genereration = ggml_time_us(); slot.t_prompt_processing = (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); } llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false }; @@ -1871,6 +1922,7 @@ struct llama_server_context slot.release(); slot.print_timings(); send_final_response(slot); + metrics.on_prediction(slot); } slot.i_batch = -1; @@ -1955,6 +2007,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); + printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); printf("\n"); printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict); printf(" --override-kv KEY=TYPE:VALUE\n"); @@ -2414,6 +2467,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { sparams.slots_endpoint = false; } + else if (arg == "--metrics") + { + sparams.metrics_endpoint = true; + } else if (arg == "--chat-template") { if (++i >= argc) @@ -2621,7 +2678,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2668,7 +2725,7 @@ int main(int argc, char **argv) // request slots data using task queue task_server task; task.id = llama.queue_tasks.get_new_id(); - task.type = TASK_TYPE_SLOTS_DATA; + task.type = TASK_TYPE_METRICS; task.target_id = -1; llama.queue_results.add_waiting_task_id(task.id); @@ -2683,6 +2740,87 @@ int main(int argc, char **argv) }); } + if (sparams.metrics_endpoint) { + svr.Get("/metrics", [&](const httplib::Request&, httplib::Response& res) { + // request slots data using task queue + task_server task; + task.id = llama.queue_tasks.get_new_id(); + task.type = TASK_TYPE_METRICS; + task.target_id = -1; + + llama.queue_results.add_waiting_task_id(task.id); + llama.queue_tasks.post(task); + + // get the result + task_result result = llama.queue_results.recv(task.id); + llama.queue_results.remove_waiting_task_id(task.id); + + json data = result.result_json; + + uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"]; + uint64_t t_prompt_processing = data["t_prompt_processing"]; + + uint64_t n_tokens_predicted = data["n_tokens_predicted"]; + uint64_t t_tokens_generation = data["t_tokens_generation"]; + + int32_t kv_cache_used_cells = data["kv_cache_used_cells"]; + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", data["n_prompt_tokens_processed_total"]} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", data["n_tokens_predicted_total"]} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", n_prompt_tokens_processed ? 1e3 / t_prompt_processing * n_prompt_tokens_processed : 0} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", n_tokens_predicted ? 1e3 / t_tokens_generation * n_tokens_predicted : 0} + },{ + {"name", "kv_cache_usage_ratio"}, + {"help", "KV-cache usage. 1 means 100 percent usage."}, + {"value", 1. * kv_cache_used_cells / params.n_ctx} + },{ + {"name", "kv_cache_tokens"}, + {"help", "KV-cache tokens."}, + {"value", data["kv_cache_tokens_count"]} + },{ + {"name", "requests_processing"}, + {"help", "Number of request processing."}, + {"value", data["processing"]} + },{ + {"name", "requests_deferred"}, + {"help", "Number of request deferred."}, + {"value", data["deferred"]} + }}} + }; + + std::stringstream prometheus; + for (const auto& el : all_metrics_def.items()) { + const auto& type = el.key(); + const auto& metrics_def = el.value(); + for (const auto& metric_def : metrics_def) { + std::string name = metric_def["name"]; + std::string help = metric_def["help"]; + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << metric_def["value"] << "\n"; + } + } + + res.set_content(prometheus.str(), "text/plain; version=0.0.4"); + res.status = 200; // HTTP OK + }); + } + svr.set_logger(log_server_request); svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 13cc841017f62..09e8267476135 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -16,6 +16,8 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): + if context.server_process is None: + return if scenario.status == "failed": if 'GITHUB_ACTIONS' in os.environ: print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 5f81d256a548c..0139f89d831a2 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -13,6 +13,7 @@ Feature: llama.cpp server And 1 slots And embeddings extraction And 32 server max tokens to predict + And prometheus compatible metrics exposed Then the server is starting Then the server is healthy @@ -25,6 +26,7 @@ Feature: llama.cpp server And max tokens to predict And a completion request with no api error Then tokens are predicted matching + And prometheus metrics are exposed Examples: Prompts | prompt | n_predict | re_content | n_predicted | diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 9c825fdbcd7f5..051fd440c3391 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -13,6 +13,7 @@ import openai from behave import step from behave.api.async_step import async_run_until_complete +from prometheus_client import parser @step(u"a server listening on {server_fqdn}:{server_port}") @@ -34,6 +35,8 @@ def step_server_config(context, server_fqdn, server_port): context.server_api_key = None context.server_continuous_batching = False context.server_embeddings = False + context.server_metrics = False + context.server_process = None context.server_seed = None context.user_api_key = None @@ -82,6 +85,11 @@ def step_server_embeddings(context): context.server_embeddings = True +@step(u'prometheus compatible metrics exposed') +def step_server_metrics(context): + context.server_metrics = True + + @step(u"the server is starting") def step_start_server(context): start_server_background(context) @@ -424,6 +432,23 @@ def step_check_options_header_value(context, cors_header, cors_header_value): assert context.options_response.headers[cors_header] == cors_header_value +@step(u'prometheus metrics are exposed') +@async_run_until_complete +async def step_prometheus_metrics_exported(context): + async with aiohttp.ClientSession() as session: + async with await session.get(f'{context.base_url}/metrics') as metrics_response: + assert metrics_response.status == 200 + assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" + metrics_raw = await metrics_response.text() + metric_exported = False + for metric in parser.text_string_to_metric_families(metrics_raw): + match metric.name: + case "llamacpp:kv_cache_usage_ratio": + assert len(metric.samples) > 0 + metric_exported = True + assert metric_exported, "No metrics exported" + + async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: @@ -753,6 +778,8 @@ def start_server_background(context): server_args.append('--cont-batching') if context.server_embeddings: server_args.append('--embedding') + if context.server_metrics: + server_args.append('--metrics') if context.model_alias is not None: server_args.extend(['--alias', context.model_alias]) if context.n_ctx is not None: diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 3e51b12dc8207..334fa4a70ea72 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,3 +1,4 @@ aiohttp~=3.9.3 behave~=1.2.6 openai~=0.25.0 +prometheus-client~=0.20.0 diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 88545eb6931d0..71cc5b0b8b6de 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -50,7 +50,7 @@ enum task_type { TASK_TYPE_COMPLETION, TASK_TYPE_CANCEL, TASK_TYPE_NEXT_RESPONSE, - TASK_TYPE_SLOTS_DATA + TASK_TYPE_METRICS }; struct task_server { @@ -441,7 +441,7 @@ struct llama_server_response { { LOG_VERBOSE("queue_results.push_back", {}); queue_results.push_back(result); - condition_results.notify_one(); + condition_results.notify_all(); return; } } From 930b1780269a69948d106e2d1b838ab7661f679a Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sun, 25 Feb 2024 13:50:32 +0100 Subject: [PATCH 09/18] server: logs - unified format and --log-format option (#5700) * server: logs - always use JSON logger, add add thread_id in message, log task_id and slot_id * server : skip GH copilot requests from logging * server : change message format of server_log() * server : no need to repeat log in comment * server : log style consistency * server : fix compile warning * server : fix tests regex patterns on M2 Ultra * server: logs: PR feedback on log level * server: logs: allow to choose log format in json or plain text * server: tests: output server logs in text * server: logs switch init logs to server logs macro * server: logs ensure value json value does not raised error * server: logs reduce level VERBOSE to VERB to max 4 chars * server: logs lower case as other log messages * server: logs avoid static in general Co-authored-by: Georgi Gerganov * server: logs PR feedback: change text log format to: LEVEL [function_name] message | additional=data --------- Co-authored-by: Georgi Gerganov --- examples/server/README.md | 4 +- examples/server/server.cpp | 218 ++++++++++++++---- examples/server/tests/README.md | 1 + examples/server/tests/features/server.feature | 6 +- examples/server/tests/features/steps/steps.py | 2 + examples/server/utils.hpp | 80 ++++--- 6 files changed, 231 insertions(+), 80 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 2129f7fb2b463..cb3fd6054095b 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -39,10 +39,12 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` -- `-n, --n-predict`: Set the maximum tokens to predict (default: -1) +- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. - `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) - `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +- `--log-disable`: Output logs to stdout only, default: enabled. +- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) ## Build diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 811495915f6dd..d970202d2b5d3 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -47,6 +47,7 @@ struct server_params }; bool server_verbose = false; +bool server_log_json = true; static size_t common_part(const std::vector &a, const std::vector &b) { @@ -302,12 +303,43 @@ struct llama_client_slot } void print_timings() const { - LOG_TEE("\n"); - LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded,t_token_generation / n_decoded, 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, t_prompt_processing + t_token_generation); + char buffer[512]; + double t_token = t_prompt_processing / num_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", + t_prompt_processing, num_prompt_tokens_processed, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"num_prompt_tokens_processed", num_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + t_token = t_token_generation / n_decoded; + n_tokens_second = 1e3 / t_token_generation * n_decoded; + sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)", + t_token_generation, n_decoded, + t_token, n_tokens_second); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_token_generation", t_token_generation}, + {"n_decoded", n_decoded}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, + }); + + sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation); + LOG_INFO(buffer, { + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"t_token_generation", t_token_generation}, + {"t_total", t_prompt_processing + t_token_generation}, + }); } }; @@ -399,7 +431,7 @@ struct llama_server_context params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); + LOG_INFO("Multi Modal Mode Enabled", {}); clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1); if(clp_ctx == nullptr) { LOG_ERROR("unable to load clip model", {{"model", params.mmproj}}); @@ -452,7 +484,7 @@ struct llama_server_context const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_TEE("Available slots:\n"); + LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; @@ -461,7 +493,10 @@ struct llama_server_context slot.n_ctx = n_ctx_slot; slot.n_predict = params.n_predict; - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_INFO("new slot", { + {"slot_id", slot.id}, + {"n_ctx_slot", slot.n_ctx} + }); const int ga_n = params.grp_attn_n; const int ga_w = params.grp_attn_w; @@ -471,7 +506,12 @@ struct llama_server_context GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT - LOG_TEE(" -> Slot %i - self-extend: ga_n = %d, ga_w = %d\n", slot.id, ga_n, ga_w); + + LOG_INFO("slot self-extend", { + {"slot_id", slot.id}, + {"ga_n", ga_n}, + {"ga_w", ga_w} + }); } slot.ga_i = 0; @@ -765,10 +805,16 @@ struct llama_server_context img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); + LOG_ERROR("failed to load image", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); return false; } - LOG_TEE("slot %i - loaded image\n", slot->id); + LOG_VERBOSE("image loaded", { + {"slot_id", slot->id}, + {"img_sl_id", img_sl.id} + }); img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -828,7 +874,10 @@ struct llama_server_context all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO("slot is processing task", { + {"slot_id", slot->id}, + {"task_id", slot->task_id}, + }); return true; } @@ -1391,7 +1440,7 @@ struct llama_server_context if (slot == nullptr) { // if no slot is available, we defer this task for processing later - LOG_VERBOSE("no slot is available", {}); + LOG_VERBOSE("no slot is available", {{"task_id", task.id}}); queue_tasks.defer(task); break; } @@ -1467,7 +1516,17 @@ struct llama_server_context } slots_data.push_back(slot_data); } - LOG_TEE("task %i - slots data: idle=%i processing=%i\n", task.id, n_idle_slots, n_processing_slots); + LOG_INFO("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots} + }); + LOG_VERBOSE("slot data", { + {"task_id", task.id}, + {"n_idle_slots", n_idle_slots}, + {"n_processing_slots", n_processing_slots}, + {"slots", slots_data} + }); task_result res; res.id = task.id; res.multitask_id = task.multitask_id; @@ -1519,7 +1578,7 @@ struct llama_server_context bool update_slots() { if (system_need_update) { - LOG_TEE("updating system prompt\n"); + LOG_INFO("updating system prompt", {}); update_system_prompt(); } @@ -1529,12 +1588,13 @@ struct llama_server_context { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE("all slots are idle and system prompt is empty, clear the KV cache\n"); + LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {}); kv_cache_clear(); } return true; } + LOG_VERBOSE("posting NEXT_RESPONSE", {}); task_server task; task.type = TASK_TYPE_NEXT_RESPONSE; task.target_id = -1; @@ -1548,10 +1608,20 @@ struct llama_server_context { // Shift context const int n_keep = slot.params.n_keep + add_bos_token; - const int n_left = system_tokens.size() + slot.n_past - n_keep; + const int n_left = (int) system_tokens.size() + slot.n_past - n_keep; const int n_discard = n_left / 2; - LOG_TEE("slot %d: context shift - n_keep = %d, n_left = %d, n_discard = %d\n", slot.id, n_keep, n_left, n_discard); + LOG_INFO("slot context shift", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_keep", n_keep}, + {"n_left", n_left}, + {"n_discard", n_discard}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()} + }); llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); llama_kv_cache_seq_shift(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); @@ -1565,17 +1635,12 @@ struct llama_server_context slot.n_past -= n_discard; slot.truncated = true; - - LOG_VERBOSE("context shift", { - { "n_ctx", n_ctx }, - { "n_keep", n_keep }, - { "n_left", n_left }, - }); } } } // decode any currently ongoing sequences + LOG_VERBOSE("decoding ongoing sequences", {}); for (auto & slot : slots) { // release the slot @@ -1585,7 +1650,15 @@ struct llama_server_context slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size()); + LOG_INFO("slot released", { + {"slot_id", slot.id}, + {"task_id", slot.task_id}, + {"n_ctx", n_ctx}, + {"n_past", slot.n_past}, + {"n_system_tokens", system_tokens.size()}, + {"n_cache_tokens", slot.cache_tokens.size()}, + {"truncated", slot.truncated} + }); queue_tasks.notify_slot_changed(); continue; @@ -1733,7 +1806,12 @@ struct llama_server_context slot.ga_i = ga_i; } - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_INFO("slot progression", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + }); } slot.cache_tokens = prompt_tokens; @@ -1741,7 +1819,10 @@ struct llama_server_context if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE("slot %d : we have to evaluate at least 1 token to generate logits\n", slot.id); + LOG_INFO("we have to evaluate at least 1 token to generate logits", { + { "slot_id", slot.id }, + { "task_id", slot.task_id } + }); slot.n_past--; if (slot.ga_i > 0) { @@ -1749,9 +1830,13 @@ struct llama_server_context } } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, (int) system_tokens.size() + slot.n_past); - - llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); + int p0 = (int) system_tokens.size() + slot.n_past; + LOG_INFO("kv cache rm [p0, end)", { + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "p0", p0 } + }); + llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, @@ -1786,7 +1871,13 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG_ERROR("failed processing images", { + "slot_id", slot.id, + "task_id", slot.task_id, + }); + // FIXME @phymbert: to be properly tested + // early returning without changing the slot state will block the slot for ever + // no one at the moment is checking the return value return false; } @@ -1928,6 +2019,8 @@ struct llama_server_context slot.i_batch = -1; } } + + LOG_VERBOSE("slots updated", {}); return true; } @@ -2005,6 +2098,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -ctv TYPE, --cache-type-v TYPE\n"); printf(" KV cache data type for V (default: f16)\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n"); + printf(" --log-format log output format: json or text (default: json)\n"); printf(" --log-disable disables logging to a file.\n"); printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n"); printf(" --metrics enable prometheus compatible metrics endpoint (default: %s).\n", sparams.metrics_endpoint ? "enabled" : "disabled"); @@ -2458,6 +2552,27 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.mmproj = argv[i]; } + else if (arg == "--log-format") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + if (std::strcmp(argv[i], "json") == 0) + { + server_log_json = true; + } + else if (std::strcmp(argv[i], "text") == 0) + { + server_log_json = false; + } + else + { + invalid_param = true; + break; + } + } else if (arg == "--log-disable") { log_set_target(stdout); @@ -2571,32 +2686,40 @@ static json format_partial_response( static json format_tokenizer_response(const std::vector &tokens) { - return json{ - {"tokens", tokens}}; + return json { + {"tokens", tokens} + }; } static json format_detokenized_response(std::string content) { - return json{ - {"content", content}}; + return json { + {"content", content} + }; } static void log_server_request(const httplib::Request &req, const httplib::Response &res) { + // skip GH copilot requests when using default port + if (req.path == "/v1/health" || req.path == "/v1/completions") + { + return; + } + LOG_INFO("request", { - {"remote_addr", req.remote_addr}, - {"remote_port", req.remote_port}, - {"status", res.status}, - {"method", req.method}, - {"path", req.path}, - {"params", req.params}, - }); + {"remote_addr", req.remote_addr}, + {"remote_port", req.remote_port}, + {"status", res.status}, + {"method", req.method}, + {"path", req.path}, + {"params", req.params}, + }); LOG_VERBOSE("request", { - {"request", req.body}, - {"response", res.body}, - }); + {"request", req.body}, + {"response", res.body}, + }); } struct token_translator @@ -2873,9 +2996,6 @@ int main(int argc, char **argv) // Set the base directory for serving static files svr.set_base_dir(sparams.public_path); - // to make it ctrl+clickable: - LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - std::unordered_map log_data; log_data["hostname"] = sparams.hostname; log_data["port"] = std::to_string(sparams.port); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index e44c5c286601f..0b9fdc4e72678 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -32,6 +32,7 @@ It's possible to override some scenario steps values with environment variables: - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` + - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format ### Run @bug, @wip or @wrong_usage annotated scenario diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 0139f89d831a2..b571582a7857e 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -29,9 +29,9 @@ Feature: llama.cpp server And prometheus metrics are exposed Examples: Prompts - | prompt | n_predict | re_content | n_predicted | - | I believe the meaning of life is | 8 | read | 8 | - | Write a joke about AI | 64 | (parkfriendsscared)+ | 32 | + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | (readgoing)+ | 8 | + | Write a joke about AI | 64 | (parkfriendsscaredalways)+ | 32 | Scenario Outline: OAI Compatibility Given a model diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 051fd440c3391..8e4babf204f8a 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -792,6 +792,8 @@ def start_server_background(context): server_args.extend(['--api-key', context.server_api_key]) if context.debug: server_args.append('--verbose') + if 'SERVER_LOG_FORMAT_JSON' not in os.environ: + server_args.extend(['--log-format', "text"]) print(f"starting server with: {context.server_path}", *server_args) context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 71cc5b0b8b6de..d7abd7cbba71c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -14,6 +14,7 @@ using json = nlohmann::json; extern bool server_verbose; +extern bool server_log_json; #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -27,14 +28,14 @@ extern bool server_verbose; { \ if (server_verbose) \ { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ + server_log("VERB", __func__, __LINE__, MSG, __VA_ARGS__); \ } \ } while (0) #endif -#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_ERROR( MSG, ...) server_log("ERR", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) +#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) // // parallel @@ -133,26 +134,48 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, - const char *message, const nlohmann::ordered_json &extra) +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { - nlohmann::ordered_json log - { + std::stringstream ss_tid; + ss_tid << std::this_thread::get_id(); + json log = nlohmann::ordered_json{ + {"tid", ss_tid.str()}, {"timestamp", time(nullptr)}, - {"level", level}, - {"function", function}, - {"line", line}, - {"message", message}, }; - if (!extra.empty()) - { - log.merge_patch(extra); - } + if (server_log_json) { + log.merge_patch( + { + {"level", level}, + {"function", function}, + {"line", line}, + {"msg", message}, + }); + if (!extra.empty()) { + log.merge_patch(extra); + } + + std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush; + } else { + char buf[1024]; + snprintf(buf, 1024, "%4s [%24s] %s", level, function, message); - const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); + if (!extra.empty()) { + log.merge_patch(extra); + } + std::stringstream ss; + ss << buf << " |"; + for (const auto& el : log.items()) + { + const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); + snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); + ss << buf; + } + + const std::string str = ss.str(); + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); + } } // @@ -234,6 +257,7 @@ struct llama_server_queue { std::unique_lock lock(mutex_tasks); if (task.id == -1) { task.id = id++; + LOG_VERBOSE("new task id", {{"new_id", task.id}}); } queue_tasks.push_back(std::move(task)); condition_tasks.notify_one(); @@ -249,7 +273,9 @@ struct llama_server_queue { // Get the next id for creating anew task int get_new_id() { std::unique_lock lock(mutex_tasks); - return id++; + int new_id = id++; + LOG_VERBOSE("new task id", {{"new_id", new_id}}); + return new_id; } // Register function to process a new task @@ -290,8 +316,7 @@ struct llama_server_queue { void start_loop() { running = true; while (true) { - // new task arrived - LOG_VERBOSE("have new task", {}); + LOG_VERBOSE("new task may arrive", {}); { while (true) { @@ -303,7 +328,7 @@ struct llama_server_queue { task_server task = queue_tasks.front(); queue_tasks.erase(queue_tasks.begin()); lock.unlock(); - LOG_VERBOSE("callback_new_task", {}); + LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); callback_new_task(task); } LOG_VERBOSE("callback_all_task_finished", {}); @@ -384,11 +409,13 @@ struct llama_server_response { std::condition_variable condition_results; void add_waiting_task_id(int task_id) { + LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } void remove_waiting_task_id(int task_id) { + LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.erase(task_id); } @@ -401,7 +428,6 @@ struct llama_server_response { condition_results.wait(lock, [&]{ return !queue_results.empty(); }); - LOG_VERBOSE("condition_results unblock", {}); for (int i = 0; i < (int) queue_results.size(); i++) { @@ -426,20 +452,20 @@ struct llama_server_response { // Send a new result to a waiting task_id void send(task_result result) { std::unique_lock lock(mutex_results); - LOG_VERBOSE("send new result", {}); + LOG_VERBOSE("send new result", {{"task_id", result.id}}); for (auto& task_id : waiting_task_ids) { // LOG_TEE("waiting task id %i \n", task_id); // for now, tasks that have associated parent multitasks just get erased once multitask picks up the result if (result.multitask_id == task_id) { - LOG_VERBOSE("callback_update_multitask", {}); + LOG_VERBOSE("callback_update_multitask", {{"task_id", task_id}}); callback_update_multitask(task_id, result.id, result); continue; } if (result.id == task_id) { - LOG_VERBOSE("queue_results.push_back", {}); + LOG_VERBOSE("queue_results.push_back", {{"task_id", task_id}}); queue_results.push_back(result); condition_results.notify_all(); return; From b5ba6c9ece593be5863549798442b84c6e3a62ef Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 25 Feb 2024 21:14:53 +0800 Subject: [PATCH 10/18] test to see if Ofast for ggml library plus batching adjustments fixes speed regression for ggmlv1 models --- Makefile | 51 +++++++++++++++++++++++---------------------- gpttype_adapter.cpp | 18 ++++++++-------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/Makefile b/Makefile index ff7c533ba2b2c..9a731e36cdec0 100644 --- a/Makefile +++ b/Makefile @@ -42,6 +42,7 @@ endif CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -I./include/vulkan -O3 -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE LDFLAGS = +FASTCFLAGS = $(subst -O3,-Ofast,$(CFLAGS)) # these are used on windows, to build some libraries with extra old device compatibility SIMPLECFLAGS = @@ -380,23 +381,23 @@ $(info ) # ggml.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v4_openblas.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v4_failsafe.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ ggml_v4_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v4_clblast.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v4_cublas.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v4_clblast_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v4_vulkan.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(VULKAN_FLAGS) -c $< -o $@ ggml_v4_vulkan_noavx2.o: ggml.c ggml.h ggml-cuda.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(VULKAN_FLAGS) -c $< -o $@ #quants ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h @@ -415,41 +416,41 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h #version 3 libs ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v3_clblast.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v3_cublas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v3_clblast_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ #version 2 libs ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@ ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) -c $< -o $@ ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ ggml_v2_cublas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@ ggml_v2_clblast_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h - $(CC) $(CFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(SIMPLECFLAGS) $(CLBLAST_FLAGS) -c $< -o $@ #extreme old version compat ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h - $(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@ ggml_v1_failsafe.o: otherarch/ggml_v1.c otherarch/ggml_v1.h - $(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@ + $(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@ #opencl ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 95b72d69d74ac..02c7d56b00e06 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -683,7 +683,14 @@ void PurgeMissingTokens(llama_context * ctx, std::vector ¤t_context_t static int GetBatchSize(int desiredBlasBatchSize,FileFormat in_file_format) { - if(desiredBlasBatchSize<=0) + //check if approved to use BLAS + bool approved_format = !(file_format == FileFormat::BADFORMAT || + file_format == FileFormat::GPT2_1 || + file_format == FileFormat::GPTJ_1 || + file_format == FileFormat::GPTJ_2 || + file_format == FileFormat::RWKV_1 || + file_format==FileFormat::RWKV_2); + if(!approved_format || desiredBlasBatchSize<=0) { desiredBlasBatchSize = 16; } @@ -1684,14 +1691,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o } } - //if using BLAS and prompt is big enough, switch to single thread and use a huge batch - bool approved_format = !(file_format == FileFormat::BADFORMAT || - file_format == FileFormat::GPT2_1 || - file_format == FileFormat::GPTJ_1 || - file_format == FileFormat::GPTJ_2 || - file_format == FileFormat::RWKV_1 || - file_format==FileFormat::RWKV_2); - bool blasmode = (approved_format && embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32); + bool blasmode = (embd_inp.size() >= 32 && ggml_cpu_has_blas() && kcpp_params->n_batch>=32); current_context_tokens.resize(n_past); From 7d548a1827f6fc6aece6db74c9d112da42c40d68 Mon Sep 17 00:00:00 2001 From: Ashok Gelal <401055+ashokgelal@users.noreply.github.com> Date: Sun, 25 Feb 2024 10:57:34 -0500 Subject: [PATCH 11/18] readme : add Msty to UI list (#5618) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3bc512af0602b..d61f9171b1b62 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) +- [Msty](https://msty.app) (proprietary) --- From f1a98c52546d009f742bdec2154c2a314ea950a6 Mon Sep 17 00:00:00 2001 From: kwin1412 <42286931+kwin1412@users.noreply.github.com> Date: Mon, 26 Feb 2024 00:46:49 +0800 Subject: [PATCH 12/18] make : fix nvcc version is empty (#5713) fix nvcc version is empty --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f03faf6eda0fb..068f6ed028460 100644 --- a/Makefile +++ b/Makefile @@ -597,7 +597,7 @@ $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) ifdef LLAMA_CUBLAS $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) -CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])') +CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH From abbabc5e51d0d4656b438aec10b7fae9479ef37d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Gryta?= Date: Sun, 25 Feb 2024 19:43:00 +0100 Subject: [PATCH 13/18] ggml-quants : provide ggml_vqtbl1q_u8 for 64bit compatibility (#5711) * [ggml-quants] Provide ggml_vqtbl1q_u8 for 64bit compatibility vqtbl1q_u8 is not part of arm v7 neon library * [android-example] Remove abi filter after arm v7a fix * [github-workflows] Do not skip Android armeabi-v7a build --- .github/workflows/build.yml | 3 +- examples/llama.android/app/build.gradle.kts | 8 ++--- ggml-quants.c | 33 ++++++++++++++++++--- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 03d76d45560cf..66ad85938ca16 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -669,8 +669,7 @@ jobs: run: | cd examples/llama.android - # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820). - ./gradlew build --no-daemon -Pskip-armeabi-v7a + ./gradlew build --no-daemon # freeBSD-latest: # runs-on: macos-12 diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts index aadbe22c91835..d42140efe8168 100644 --- a/examples/llama.android/app/build.gradle.kts +++ b/examples/llama.android/app/build.gradle.kts @@ -21,12 +21,8 @@ android { useSupportLibrary = true } ndk { - // Workaround for https://github.com/llvm/llvm-project/issues/65820 - // affecting armeabi-v7a. Skip armeabi-v7a when invoked with - // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a). - if (project.hasProperty("skip-armeabi-v7a")) { - abiFilters += listOf("arm64-v8a", "x86_64", "x86") - } + // Add NDK properties if wanted, e.g. + // abiFilters += listOf("arm64-v8a") } externalNativeBuild { cmake { diff --git a/ggml-quants.c b/ggml-quants.c index 5c5f2ce1b9b87..3d94d166d1b6d 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -462,6 +462,30 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { return res; } +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + #else #define ggml_int16x8x2_t int16x8x2_t @@ -476,6 +500,7 @@ inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { #define ggml_vld1q_s8_x2 vld1q_s8_x2 #define ggml_vld1q_s8_x4 vld1q_s8_x4 #define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 #endif @@ -9488,8 +9513,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v qs += 16; vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); @@ -9497,8 +9522,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); - vs.val[1] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); - vs.val[0] = vandq_u8(vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); + vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); + vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); vs.val[0] = vceqq_u8(vs.val[0], mask2); vs.val[1] = vceqq_u8(vs.val[1], mask2); From f7625019c51ca437a5840576d92362cfa710e4a2 Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Sun, 25 Feb 2024 13:43:50 -0500 Subject: [PATCH 14/18] server : fix crash when system prompt is bigger than batch size (#5714) The system prompt is now decoded in batches. * server : fix off-by-one n_past when start of prompt matches whole cache The tokens right after the matching part would otherwise skip a pos value. --- examples/server/server.cpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d970202d2b5d3..c1eb61678c38a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -902,10 +902,24 @@ struct llama_server_context llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } - if (llama_decode(ctx, batch) != 0) + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch) { - LOG_TEE("%s: llama_decode() failed\n", __func__); - return; + const int32_t n_tokens = std::min(params.n_batch, (int32_t) (batch.n_tokens - i)); + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.n_seq_id + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + if (llama_decode(ctx, batch_view) != 0) + { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return; + } } // assign the system KV cache to all parallel sequences @@ -1785,6 +1799,14 @@ struct llama_server_context } slot.n_past = common_part(slot.cache_tokens, prompt_tokens); + + // the last token of the cache is not in the KV cache until the next call to llama_decode + // (it was sampled, pushed into the "cache_tokens", but not yet put in the context) + if (slot.n_past > 0 && slot.n_past == (int32_t) slot.cache_tokens.size()) + { + slot.n_past -= 1; + } + slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; if (slot.ga_n != 1) From a6ba735b07667c2f44e9414311188707671cd058 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:40:12 +0800 Subject: [PATCH 15/18] up version for 1.59.1 makefile changes --- koboldcpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/koboldcpp.py b/koboldcpp.py index a0630c9f16d08..a0b4da48d138f 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -475,7 +475,7 @@ def bring_terminal_to_foreground(): modelbusy = threading.Lock() requestsinqueue = 0 defaultport = 5001 -KcppVersion = "1.59" +KcppVersion = "1.59.1" showdebug = True showsamplerwarning = True showmaxctxwarning = True From d47e13c8924067d28bdc1997ede69eac4d953a04 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 26 Feb 2024 10:49:02 +0800 Subject: [PATCH 16/18] fixed compile error: GGML_BACKEND_TYPE_GPU (+1 squashed commits) Squashed commits: [00ca282a] fixed compile error: LLAMA_SPLIT_MODE_ROW --- gpttype_adapter.cpp | 4 ++-- llama.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 02c7d56b00e06..edb7f02062398 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -970,9 +970,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in model_params.main_gpu = cu_parseinfo_maindevice; #if defined(GGML_USE_CUBLAS) - model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_ROW:llama_split_mode::LLAMA_SPLIT_LAYER); + model_params.split_mode = (inputs.use_rowsplit?llama_split_mode::LLAMA_SPLIT_MODE_ROW:llama_split_mode::LLAMA_SPLIT_MODE_LAYER); #else - model_params.split_mode = llama_split_mode::LLAMA_SPLIT_LAYER; + model_params.split_mode = llama_split_mode::LLAMA_SPLIT_MODE_LAYER; #endif llama_ctx_params.n_batch = kcpp_params->n_batch; diff --git a/llama.cpp b/llama.cpp index ffc45f4bc1b50..cac6b6271a99e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2863,7 +2863,7 @@ struct llama_model_loader { bool shouldoffload = (layernum>=0 && clblast_offload_fallback_layers>layernum); if(shouldoffload) { - cur->backend = GGML_BACKEND_GPU; + cur->backend = GGML_BACKEND_TYPE_GPU; ggml_cl_transform_tensor(cur->data, cur); } } From 7b8591782783ec162035c66f606144f44638ef1f Mon Sep 17 00:00:00 2001 From: YellowRoseCx <80486540+YellowRoseCx@users.noreply.github.com> Date: Sun, 25 Feb 2024 21:15:57 -0600 Subject: [PATCH 17/18] add additional tooltips (#710) --- koboldcpp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index a0b4da48d138f..0dac1386f8ef1 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1650,10 +1650,10 @@ def changerunmode(a,b,c): gpuname_label.grid(row=3, column=1, padx=75, sticky="W") gpuname_label.configure(text_color="#ffff00") gpu_layers_entry,gpu_layers_label = makelabelentry(hardware_tab,"GPU Layers:", gpulayers_var, 6, 50,"How many layers to offload onto the GPU.\nVRAM intensive, usage increases with model and context size.\nRequires some trial and error to find the best fit value.") - tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80) - lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0) - mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1) - splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0) + tensor_split_entry,tensor_split_label = makelabelentry(hardware_tab, "Tensor Split:", tensor_split_str_vars, 8, 80, tooltip='When using multiple GPUs this option controls how large tensors should be split across all GPUs.\nUses a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order.\nFor example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1.') + lowvram_box = makecheckbox(hardware_tab, "Low VRAM", lowvram_var, 4,0, tooltiptxt='Select lowvram to not allocate VRAM scratch buffer/ "K" & "V" Cache.\nCan save a decent amount of VRAM, but makes fully offloaded model processing a bit slower.') + mmq_box = makecheckbox(hardware_tab, "Use QuantMatMul (mmq)", mmq_var, 4,1, tooltiptxt="Enable MMQ mode to use finetuned kernels instead of default CuBLAS/HipBLAS for prompt processing.\nRead the wiki. Speed may vary.") + splitmode_box = makecheckbox(hardware_tab, "Row-Split", rowsplit_var, 5,0, tooltiptxt="Split rows across GPUs instead of splitting layers and KV across GPUs.\nUses the main GPU for small tensors and intermediate results. Speed may vary.") # threads makelabelentry(hardware_tab, "Threads:" , threads_var, 11, 50,"How many threads to use.\nRecommended value is your CPU core count, defaults are usually OK.") From 39ae58ef0ded7fbd76afb74e21de507dbae9f686 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 26 Feb 2024 11:35:58 +0800 Subject: [PATCH 18/18] fix tooltip glitch --- koboldcpp.py | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/koboldcpp.py b/koboldcpp.py index 0dac1386f8ef1..742805425971b 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -1167,6 +1167,30 @@ def show_new_gui(): gtooltip_box = None gtooltip_label = None + # trigger empty tooltip then remove it + def show_tooltip(event, tooltip_text=None): + nonlocal gtooltip_box, gtooltip_label + if not gtooltip_box and not gtooltip_label: + gtooltip_box = ctk.CTkToplevel(root) + gtooltip_box.configure(fg_color="#ffffe0") + gtooltip_box.withdraw() + gtooltip_box.overrideredirect(True) + gtooltip_label = ctk.CTkLabel(gtooltip_box, text=tooltip_text, text_color="#000000", fg_color="#ffffe0") + gtooltip_label.pack(expand=True, padx=2, pady=1) + else: + gtooltip_label.configure(text=tooltip_text) + + x, y = root.winfo_pointerxy() + gtooltip_box.wm_geometry(f"+{x + 10}+{y + 10}") + gtooltip_box.deiconify() + + def hide_tooltip(event): + nonlocal gtooltip_box + if gtooltip_box: + gtooltip_box.withdraw() + show_tooltip(None,"") #initialize tooltip objects + hide_tooltip(None) + tabs = ctk.CTkFrame(root, corner_radius = 0, width=windowwidth, height=windowheight-50) tabs.grid(row=0, stick="nsew") tabnames= ["Quick Launch", "Hardware", "Tokens", "Model", "Network"] @@ -1470,27 +1494,6 @@ def autoset_gpu_layers(filepath): #shitty algo to determine how many layers to u except Exception as ex: pass - def show_tooltip(event, tooltip_text=None): - nonlocal gtooltip_box, gtooltip_label - if not gtooltip_box: - gtooltip_box = ctk.CTkToplevel(root) - gtooltip_box.configure(fg_color="#ffffe0") - gtooltip_box.withdraw() - gtooltip_box.overrideredirect(True) - gtooltip_label = ctk.CTkLabel(gtooltip_box, text=tooltip_text, text_color="#000000", fg_color="#ffffe0") - gtooltip_label.pack(expand=True, padx=2, pady=1) - else: - gtooltip_label.configure(text=tooltip_text) - - x, y = root.winfo_pointerxy() - gtooltip_box.wm_geometry(f"+{x + 10}+{y + 10}") - gtooltip_box.deiconify() - - def hide_tooltip(event): - nonlocal gtooltip_box - if gtooltip_box: - gtooltip_box.withdraw() - def setup_backend_tooltip(parent): # backend count label with the tooltip function nl = '\n'