diff --git a/.cmake-format.yaml b/.cmake-format.yaml new file mode 100644 index 0000000000..98ab11753a --- /dev/null +++ b/.cmake-format.yaml @@ -0,0 +1,76 @@ +format: + _help_line_width: + - How wide to allow formatted cmake files + line_width: 120 + _help_tab_size: + - How many spaces to tab for indent + tab_size: 4 + _help_use_tabchars: + - If true, lines are indented using tab characters (utf-8 + - 0x09) instead of space characters (utf-8 0x20). + - In cases where the layout would require a fractional tab + - character, the behavior of the fractional indentation is + - governed by + use_tabchars: false + _help_separate_ctrl_name_with_space: + - If true, separate flow control names from their parentheses + - with a space + separate_ctrl_name_with_space: true + _help_min_prefix_chars: + - If the statement spelling length (including space and + - parenthesis) is smaller than this amount, then force reject + - nested layouts. + min_prefix_chars: 4 + _help_max_prefix_chars: + - If the statement spelling length (including space and + - parenthesis) is larger than the tab width by more than this + - amount, then force reject un-nested layouts. + max_prefix_chars: 10 + _help_max_lines_hwrap: + - If a candidate layout is wrapped horizontally but it exceeds + - this many lines, then reject the layout. + max_lines_hwrap: 2 + _help_line_ending: + - What style line endings to use in the output. 
+ line_ending: unix + _help_command_case: + - Format command names consistently as 'lower' or 'upper' case + command_case: lower + _help_keyword_case: + - Format keywords consistently as 'lower' or 'upper' case + keyword_case: unchanged + _help_always_wrap: + - A list of command names which should always be wrapped + always_wrap: [] + _help_enable_sort: + - If true, the argument lists which are known to be sortable + - will be sorted lexicographically + enable_sort: true + _help_autosort: + - If true, the parsers may infer whether or not an argument + - list is sortable (without annotation). + autosort: false + _help_require_valid_layout: + - By default, if cmake-format cannot successfully fit + - everything into the desired linewidth it will apply the + - last, most aggressive attempt that it made. If this flag is + - True, however, cmake-format will print error, exit with non- + - zero status code, and write-out nothing + require_valid_layout: false + _help_layout_passes: + - A dictionary mapping layout nodes to a list of wrap + - decisions. See the documentation for more information. + layout_passes: {} +encode: + _help_emit_byteorder_mark: + - If true, emit the unicode byte-order mark (BOM) at the start + - of the file + emit_byteorder_mark: false + _help_input_encoding: + - Specify the encoding of the input file. Defaults to utf-8 + input_encoding: utf-8 + _help_output_encoding: + - Specify the encoding of the output file. Defaults to utf-8. + - Note that cmake only claims to support utf-8 so be careful + - when using anything else + output_encoding: utf-8 diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 9588d36020..8c4a0a8db5 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -9,6 +9,9 @@ contact_links: - name: Chat with us on Matrix? url: https://matrix.to/#/#valkey:matrix.org about: We are on Matrix too! + - name: Chat with us on Slack? 
+ url: https://join.slack.com/t/valkey-oss-developer/shared_invite/zt-2nxs51chx-EB9hu9Qdch3GMfRcztTSkQ + about: We are on Slack too! - name: Documentation issue? url: https://github.com/valkey-io/valkey-doc/issues about: Report it on the valkey-doc repo. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f1d23f40fa..df3eaa1905 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: - name: make # Fail build if there are warnings # build with TLS just for compilation coverage - run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes + run: make -j4 all-with-unit-tests SERVER_CFLAGS='-Werror' BUILD_TLS=yes USE_FAST_FLOAT=yes - name: test run: | sudo apt-get install tcl8.6 tclx @@ -34,6 +34,31 @@ jobs: run: | ./src/valkey-unit-tests + test-ubuntu-latest-cmake: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: cmake and make + run: | + sudo apt-get install -y cmake libssl-dev + mkdir -p build-release + cd build-release + cmake -DCMAKE_BUILD_TYPE=Release .. 
-DBUILD_TLS=yes -DBUILD_UNIT_TESTS=yes + make -j$(nproc) + - name: test + run: | + sudo apt-get install -y tcl8.6 tclx + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-server + ln -sf $(pwd)/build-release/bin/valkey-cli $(pwd)/src/valkey-cli + ln -sf $(pwd)/build-release/bin/valkey-benchmark $(pwd)/src/valkey-benchmark + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-check-aof + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-check-rdb + ln -sf $(pwd)/build-release/bin/valkey-server $(pwd)/src/valkey-sentinel + ./runtest --verbose --tags -slow --dump-logs + - name: unit tests + run: | + ./build-release/bin/valkey-unit-tests + test-sanitizer-address: runs-on: ubuntu-latest steps: @@ -52,10 +77,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make + - name: prepare-development-libraries + run: sudo apt-get install librdmacm-dev libibverbs-dev + - name: make-rdma-module + run: make -j4 BUILD_RDMA=module + - name: make-rdma-builtin run: | - sudo apt-get install librdmacm-dev libibverbs-dev - make -j4 BUILD_RDMA=module + make distclean + make -j4 BUILD_RDMA=yes - name: clone-rxe-kmod run: | mkdir -p tests/rdma/rxe @@ -83,23 +112,30 @@ jobs: steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j3 SERVER_CFLAGS='-Werror' + # Build with additional upcoming features + run: make -j3 all-with-unit-tests SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes build-32bit: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make + # Fast float requires C++ 32-bit libraries to compile on 64-bit ubuntu + # machine i.e. "-cross" suffixed version. Cross-compiling c++ to 32-bit + # also requires multilib support for g++ compiler i.e. "-multilib" + # suffixed version of g++. g++-multilib generally includes libstdc++. 
+ # *cross version as well, but it is also added explicitly just in case. run: | - sudo apt-get update && sudo apt-get install libc6-dev-i386 - make -j4 SERVER_CFLAGS='-Werror' 32bit + sudo apt-get update + sudo apt-get install libc6-dev-i386 libstdc++-11-dev-i386-cross gcc-multilib g++-multilib + make -j4 SERVER_CFLAGS='-Werror' 32bit USE_FAST_FLOAT=yes build-libc-malloc: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make - run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc + run: make -j4 SERVER_CFLAGS='-Werror' MALLOC=libc USE_FAST_FLOAT=yes build-almalinux8-jemalloc: runs-on: ubuntu-latest @@ -109,8 +145,8 @@ jobs: - name: make run: | - dnf -y install epel-release gcc make procps-ng which - make -j4 SERVER_CFLAGS='-Werror' + dnf -y install epel-release gcc gcc-c++ make procps-ng which + make -j4 SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes format-yaml: runs-on: ubuntu-latest diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index bcfa35c939..c06d73440d 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -60,7 +60,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make all-with-unit-tests SERVER_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -75,10 +75,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -89,7 +86,7 @@ jobs: (github.event_name == 'schedule' && github.repository == 
'valkey-io/valkey') || (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'fortify') - container: ubuntu:lunar + container: ubuntu:plucky timeout-minutes: 14400 steps: - name: prep @@ -109,7 +106,7 @@ jobs: run: | apt-get update && apt-get install -y make gcc-13 update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 100 - make all-with-unit-tests CC=gcc OPT=-O3 SERVER_CFLAGS='-Werror -DSERVER_TEST -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' + make all-with-unit-tests CC=gcc OPT=-O3 SERVER_CFLAGS='-Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3' - name: testprep run: apt-get install -y tcl8.6 tclx procps - name: test @@ -124,10 +121,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -234,7 +228,7 @@ jobs: - name: make run: | sudo apt-get update && sudo apt-get install libc6-dev-i386 - make 32bit SERVER_CFLAGS='-Werror -DSERVER_TEST' + make 32bit SERVER_CFLAGS='-Werror' - name: testprep run: sudo apt-get install tcl8.6 tclx - name: test @@ -251,10 +245,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -328,7 +319,7 
@@ jobs: ref: ${{ env.GITHUB_HEAD_REF }} - name: make run: | - make BUILD_TLS=yes SERVER_CFLAGS='-Werror' + make BUILD_TLS=yes SERVER_CFLAGS='-Werror' USE_FAST_FLOAT=yes - name: testprep run: | sudo apt-get install tcl8.6 tclx tcl-tls @@ -483,7 +474,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make valgrind SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -515,7 +506,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make valgrind all-with-unit-tests SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -526,7 +517,7 @@ jobs: - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | - valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-server test all --valgrind + valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-unit-tests --valgrind if grep -q 0x err.txt; then cat err.txt; exit 1; fi test-valgrind-no-malloc-usable-size-test: @@ -552,7 +543,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DSERVER_TEST" SERVER_CFLAGS='-Werror' + run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -584,7 +575,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make valgrind CFLAGS="-DNO_MALLOC_USABLE_SIZE -DSERVER_TEST" SERVER_CFLAGS='-Werror' + run: make valgrind all-with-unit-tests CFLAGS="-DNO_MALLOC_USABLE_SIZE" SERVER_CFLAGS='-Werror' - name: testprep 
run: | sudo apt-get update @@ -595,7 +586,7 @@ jobs: - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: | - valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-server test all --valgrind + valgrind --track-origins=yes --suppressions=./src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full --log-file=err.txt ./src/valkey-unit-tests --valgrind if grep -q 0x err.txt; then cat err.txt; exit 1; fi test-sanitizer-address: @@ -627,7 +618,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests OPT=-O3 SANITIZER=address SERVER_CFLAGS='-DSERVER_TEST -Werror' + run: make all-with-unit-tests OPT=-O3 SANITIZER=address SERVER_CFLAGS='-Werror' - name: testprep run: | sudo apt-get update @@ -644,10 +635,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster ${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests @@ -680,7 +668,7 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make all-with-unit-tests OPT=-O3 SANITIZER=undefined SERVER_CFLAGS='-DSERVER_TEST -Werror' LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations + run: make all-with-unit-tests OPT=-O3 SANITIZER=undefined SERVER_CFLAGS='-Werror' LUA_DEBUG=yes # we (ab)use this flow to also check Lua C API violations - name: testprep run: | sudo apt-get update @@ -697,10 +685,7 @@ jobs: - name: cluster tests if: true && !contains(github.event.inputs.skiptests, 'cluster') run: ./runtest-cluster 
${{github.event.inputs.cluster_test_args}} - - name: legacy unit tests - if: true && !contains(github.event.inputs.skiptests, 'unittest') - run: ./src/valkey-server test all --accurate - - name: new unit tests + - name: unittest if: true && !contains(github.event.inputs.skiptests, 'unittest') run: ./src/valkey-unit-tests --accurate @@ -1005,7 +990,7 @@ jobs: strategy: fail-fast: false matrix: - os: [macos-12, macos-14] + os: [macos-13, macos-14] runs-on: ${{ matrix.os }} if: | (github.event_name == 'workflow_dispatch' || @@ -1031,10 +1016,10 @@ jobs: repository: ${{ env.GITHUB_REPOSITORY }} ref: ${{ env.GITHUB_HEAD_REF }} - name: make - run: make SERVER_CFLAGS='-Werror -DSERVER_TEST' + run: make SERVER_CFLAGS='-Werror' test-freebsd: - runs-on: macos-12 + runs-on: macos-13 if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || diff --git a/.gitignore b/.gitignore index e448e23f7e..d5cac316e6 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,7 @@ nodes*.conf tests/cluster/tmp/* tests/rdma/rdma-test tags +build-debug/ +build-release/ +cmake-build-debug/ +cmake-build-release/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000000..77d0c4e7d8 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,43 @@ +cmake_minimum_required(VERSION 3.10) + +# Must be done first +if (APPLE) + # Force clang compiler on macOS + find_program(CLANGPP "clang++") + find_program(CLANG "clang") + if (CLANG AND CLANGPP) + message(STATUS "Found ${CLANGPP}, ${CLANG}") + set(CMAKE_CXX_COMPILER ${CLANGPP}) + set(CMAKE_C_COMPILER ${CLANG}) + endif () +endif () + +# Options +option(BUILD_UNIT_TESTS "Build valkey-unit-tests" OFF) +option(BUILD_TEST_MODULES "Build all test modules" OFF) +option(BUILD_EXAMPLE_MODULES "Build example modules" OFF) + +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/Modules/") +project("valkey") + +set(CMAKE_C_STANDARD 11) +set(CMAKE_C_STANDARD_REQUIRED 
ON) +set(CMAKE_C_EXTENSIONS ON) + +include(ValkeySetup) +add_subdirectory(src) +add_subdirectory(tests) + +# Include the packaging module +include(Packaging) + +# Clear cached variables from the cache +unset(BUILD_TESTS CACHE) +unset(CLANGPP CACHE) +unset(CLANG CACHE) +unset(BUILD_RDMA_MODULE CACHE) +unset(BUILD_TLS_MODULE CACHE) +unset(BUILD_UNIT_TESTS CACHE) +unset(BUILD_TEST_MODULES CACHE) +unset(BUILD_EXAMPLE_MODULES CACHE) +unset(USE_TLS CACHE) diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 1c530ec7ba..36764bb81b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -49,7 +49,7 @@ representative at an online or offline event. Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -this email address: placeholderkv@gmail.com. +this email address: maintainers@lists.valkey.io. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. diff --git a/GOVERNANCE.md b/GOVERNANCE.md index 33c3887430..7fd33272cb 100644 --- a/GOVERNANCE.md +++ b/GOVERNANCE.md @@ -2,7 +2,9 @@ The Valkey project is managed by a Technical Steering Committee (TSC) composed of the maintainers of the Valkey repository. The Valkey project includes all of the current and future repositories under the Valkey-io organization. -Maintainers are defined as individuals with full commit access to a repository, which shall be in sync with the MAINTAINERS.md file in a given projects repository. +Committers are defined as individuals with write access to the code within a repository. +Maintainers are defined as individuals with full access to a repository and own its governance. +Both maintainers and committers should be clearly listed in the MAINTAINERS.md file in a given project's repository. 
Maintainers of other repositories within the Valkey project are not members of the TSC unless explicitly added. ## Technical Steering Committee diff --git a/MAINTAINERS.md b/MAINTAINERS.md index 635bf25067..947979eb33 100644 --- a/MAINTAINERS.md +++ b/MAINTAINERS.md @@ -16,8 +16,16 @@ Maintainers listed in alphabetical order by their github ID. | Zhao Zhao | [soloestoy](https://github.com/soloestoy) | Alibaba | | Viktor Söderqvist | [zuiderkwast](https://github.com/zuiderkwast) | Ericsson | +## Current Committers -### Former Maintainers +Committers listed in alphabetical order by their github ID. + +| Committer | GitHub ID | Affiliation | +| ------------------- | ----------------------------------------------- | ----------- | +| Harkrishn Patro | [hpatro](https://github.com/hpatro) | Amazon | +| Ran Shidlansik | [ranshid](https://github.com/ranshid) | Amazon | + +### Former Maintainers and Committers | Maintainer | GitHub ID | Affiliation | | ------------------- | ----------------------------------------------- | ----------- | \ No newline at end of file diff --git a/README.md b/README.md index 1a8ce1a4db..c447cc8d47 100644 --- a/README.md +++ b/README.md @@ -4,13 +4,12 @@ This project was forked from the open source Redis project right before the tran This README is just a fast *quick start* document. More details can be found under [valkey.io](https://valkey.io/) -What is Valkey? --------------- +# What is Valkey? + Valkey is a high-performance data structure server that primarily serves key/value workloads. It supports a wide range of native structures and an extensible plugin system for adding new data structures and access patterns. -Building Valkey --------------- +# Building Valkey using `Makefile` Valkey can be compiled and used on Linux, OSX, OpenBSD, NetBSD, FreeBSD. We support big endian and little endian architectures, and both 32 bit @@ -38,12 +37,17 @@ To build TLS as Valkey module: Note that sentinel mode does not support TLS module. 
To build with experimental RDMA support you'll need RDMA development libraries -(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). For now, Valkey only -supports RDMA as connection module mode. Run: +(e.g. librdmacm-dev and libibverbs-dev on Debian/Ubuntu). + +To build RDMA support as Valkey built-in: + + % make BUILD_RDMA=yes + +To build RDMA as Valkey module: % make BUILD_RDMA=module -To build with systemd support, you'll need systemd development libraries (such +To build with systemd support, you'll need systemd development libraries (such as libsystemd-dev on Debian/Ubuntu or systemd-devel on CentOS) and run: % make USE_SYSTEMD=yes @@ -71,8 +75,7 @@ More about running the integration tests can be found in [tests/README.md](tests/README.md) and for unit tests, see [src/unit/README.md](src/unit/README.md). -Fixing build problems with dependencies or cached build options ---------- +## Fixing build problems with dependencies or cached build options Valkey has some dependencies which are included in the `deps` directory. `make` does not automatically rebuild dependencies even if something in @@ -91,8 +94,7 @@ optimizations (for debugging purposes), and other similar build time options, those options are cached indefinitely until you issue a `make distclean` command. -Fixing problems building 32 bit binaries ---------- +## Fixing problems building 32 bit binaries If after building Valkey with a 32 bit target you need to rebuild it with a 64 bit target, or the other way around, you need to perform a @@ -105,8 +107,7 @@ the following steps: * Try using the following command line instead of `make 32bit`: `make CFLAGS="-m32 -march=native" LDFLAGS="-m32"` -Allocator ---------- +## Allocator Selecting a non-default memory allocator when building Valkey is done by setting the `MALLOC` environment variable. 
Valkey is compiled and linked against libc @@ -122,28 +123,25 @@ To compile against jemalloc on Mac OS X systems, use: % make MALLOC=jemalloc -Monotonic clock ---------------- +## Monotonic clock By default, Valkey will build using the POSIX clock_gettime function as the monotonic clock source. On most modern systems, the internal processor clock -can be used to improve performance. Cautions can be found here: +can be used to improve performance. Cautions can be found here: http://oliveryang.net/2015/09/pitfalls-of-TSC-usage/ To build with support for the processor's internal instruction clock, use: % make CFLAGS="-DUSE_PROCESSOR_CLOCK" -Verbose build -------------- +## Verbose build Valkey will build with a user-friendly colorized output by default. If you want to see a more verbose output, use the following: % make V=1 -Running Valkey -------------- +# Running Valkey To run Valkey with the default configuration, just type: @@ -165,10 +163,10 @@ as options using the command line. Examples: All the options in valkey.conf are also supported as options using the command line, with exactly the same name. -Running Valkey with TLS: ------------------- +# Running Valkey with TLS: + +## Running manually -### Running manually To manually run a Valkey server with TLS mode (assuming `./gen-test-certs.sh` was invoked so sample certificates/keys are available): * TLS built-in mode: @@ -204,27 +202,33 @@ Specifying `--tls-replication yes` makes a replica connect to the primary. Using `--tls-cluster yes` makes Valkey Cluster use TLS across nodes. -Running Valkey with RDMA: ------------------- +# Running Valkey with RDMA: Note that Valkey Over RDMA is an experimental feature. It may be changed or removed in any minor or major version. Currently, it is only supported on Linux. 
-To manually run a Valkey server with RDMA mode: +* RDMA built-in mode: + ``` + ./src/valkey-server --protected-mode no \ + --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` - % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 +* RDMA module mode: + ``` + ./src/valkey-server --protected-mode no \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 + ``` It's possible to change bind address/port of RDMA by runtime command: - 192.168.122.100:6379> CONFIG SET rdma.port 6380 + 192.168.122.100:6379> CONFIG SET rdma-port 6380 It's also possible to have both RDMA and TCP available, and there is no conflict of TCP(6379) and RDMA(6379), Ex: % ./src/valkey-server --protected-mode no \ - --loadmodule src/valkey-rdma.so bind=192.168.122.100 port=6379 \ + --loadmodule src/valkey-rdma.so --rdma-bind 192.168.122.100 --rdma-port 6379 \ --port 6379 Note that the network card (192.168.122.100 of this example) should support @@ -236,8 +240,7 @@ Or: % ibv_devices -Playing with Valkey ------------------- +# Playing with Valkey You can use valkey-cli to play with Valkey. Start a valkey-server instance, then in another terminal try the following: @@ -256,8 +259,7 @@ then in another terminal try the following: (integer) 2 valkey> -Installing Valkey ------------------ +# Installing Valkey In order to install Valkey binaries into /usr/local/bin, just use: @@ -289,16 +291,82 @@ system reboots. You'll be able to stop and start Valkey using the script named `/etc/init.d/valkey_`, for instance `/etc/init.d/valkey_6379`. -Code contributions ------------------ +# Building using `CMake` + +In addition to the traditional `Makefile` build, Valkey supports an alternative, **experimental**, build system using `CMake`. + +To build and install `Valkey`, in `Release` mode (an optimized build), type this into your terminal: + +```bash +mkdir build-release +cd $_ +cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/opt/valkey +sudo make install +# Valkey is now installed under /opt/valkey +``` + +Other options supported by Valkey's `CMake` build system: + +## Special build flags + +- `-DBUILD_TLS=` enable TLS build for Valkey. Default: `no` +- `-DBUILD_RDMA=` enable RDMA module build (only module mode supported). Default: `no` +- `-DBUILD_MALLOC=` choose the allocator to use. Default on Linux: `jemalloc`, for other OS: `libc` +- `-DBUILD_SANITIZER=` build with address sanitizer enabled. Default: disabled (no sanitizer) +- `-DBUILD_UNIT_TESTS=[yes|no]` when set, the build will produce the executable `valkey-unit-tests`. Default: `no` +- `-DBUILD_TEST_MODULES=[yes|no]` when set, the build will include the modules located under the `tests/modules` folder. Default: `no` +- `-DBUILD_EXAMPLE_MODULES=[yes|no]` when set, the build will include the example modules located under the `src/modules` folder. Default: `no` + +## Common flags + +- `-DCMAKE_BUILD_TYPE=` define the build type, see CMake manual for more details +- `-DCMAKE_INSTALL_PREFIX=/installation/path` override this value to define a custom install prefix. Default: `/usr/local` +- `-G""` generate build files for "Generator Name". By default, CMake will generate `Makefile`s. + +## Verbose build + +`CMake` generates a user-friendly colorized output by default. +If you want to see a more verbose output, use the following: + +```bash +make VERBOSE=1 +``` + +## Troubleshooting + +During the `CMake` stage, `CMake` caches variables in a local file named `CMakeCache.txt`. All variables generated by Valkey +are removed from the cache once consumed (this is done by calling to `unset(VAR-NAME CACHE)`). However, some variables, +like the compiler path, are kept in cache. To start a fresh build either remove the cache file `CMakeCache.txt` from the +build folder, or delete the build folder completely. 
+ +**It is important to re-run `CMake` when adding new source files.** + +## Integration with IDE + +During the `CMake` stage of the build, `CMake` generates a JSON file named `compile_commands.json` and places it under the +build folder. This file is used by many IDEs and text editors for providing code completion (via `clangd`). + +A small caveat is that these tools will look for `compile_commands.json` under the Valkey's top folder. +A common workaround is to create a symbolic link to it: + +```bash +cd /path/to/valkey/ +# We assume here that your build folder is `build-release` +ln -sf $(pwd)/build-release/compile_commands.json $(pwd)/compile_commands.json +``` + +Restart your IDE and voila + +# Code contributions + Please see the [CONTRIBUTING.md][2]. For security bugs and vulnerabilities, please see [SECURITY.md][3]. -[1]: https://github.com/valkey-io/valkey/blob/unstable/COPYING -[2]: https://github.com/valkey-io/valkey/blob/unstable/CONTRIBUTING.md -[3]: https://github.com/valkey-io/valkey/blob/unstable/SECURITY.md +# Valkey is an open community project under LF Projects -Valkey is an open community project under LF Projects ------------------ Valkey a Series of LF Projects, LLC 2810 N Church St, PMB 57274 Wilmington, Delaware 19802-4447 + +[1]: https://github.com/valkey-io/valkey/blob/unstable/COPYING +[2]: https://github.com/valkey-io/valkey/blob/unstable/CONTRIBUTING.md +[3]: https://github.com/valkey-io/valkey/blob/unstable/SECURITY.md diff --git a/cmake/Modules/Packaging.cmake b/cmake/Modules/Packaging.cmake new file mode 100644 index 0000000000..c7ed5c426b --- /dev/null +++ b/cmake/Modules/Packaging.cmake @@ -0,0 +1,44 @@ +set(CPACK_PACKAGE_NAME "valkey") + +valkey_parse_version(CPACK_PACKAGE_VERSION_MAJOR CPACK_PACKAGE_VERSION_MINOR CPACK_PACKAGE_VERSION_PATCH) + +set(CPACK_PACKAGE_CONTACT "maintainers@lists.valkey.io") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Valkey is an open source (BSD) high-performance key/value datastore") 
+set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_SOURCE_DIR}/COPYING") +set(CPACK_RESOURCE_FILE_README "${CMAKE_SOURCE_DIR}/README.md") +set(CPACK_STRIP_FILES TRUE) + +valkey_get_distro_name(DISTRO_NAME) +message(STATUS "Current host distro: ${DISTRO_NAME}") + +if (DISTRO_NAME MATCHES ubuntu + OR DISTRO_NAME MATCHES debian + OR DISTRO_NAME MATCHES mint) + message(STATUS "Adding target package for ${DISTRO_NAME}") + set(CPACK_PACKAGING_INSTALL_PREFIX "/opt/valkey") + # Debian related parameters + set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Valkey contributors") + set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON) + set(CPACK_DEBIAN_FILE_NAME DEB-DEFAULT) + set(CPACK_GENERATOR "DEB") +endif () + +include(CPack) +unset(DISTRO_NAME CACHE) + +# --------------------------------------------------- +# Create a helper script for creating symbolic links +# --------------------------------------------------- +write_file( + ${CMAKE_BINARY_DIR}/CreateSymlink.sh + "\ +#!/bin/bash \n\ +if [ -z \${DESTDIR} ]; then \n\ + # Script is called during 'make install' \n\ + PREFIX=${CMAKE_INSTALL_PREFIX}/bin \n\ +else \n\ + # Script is called during 'make package' \n\ + PREFIX=\${DESTDIR}${CPACK_PACKAGING_INSTALL_PREFIX}/bin \n\ +fi \n\ +cd \$PREFIX \n\ +ln -sf \$1 \$2") diff --git a/cmake/Modules/SourceFiles.cmake b/cmake/Modules/SourceFiles.cmake new file mode 100644 index 0000000000..c34ae644a2 --- /dev/null +++ b/cmake/Modules/SourceFiles.cmake @@ -0,0 +1,155 @@ +# ------------------------------------------------- +# Define the sources to be built +# ------------------------------------------------- + +# valkey-server source files +set(VALKEY_SERVER_SRCS + ${CMAKE_SOURCE_DIR}/src/threads_mngr.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/quicklist.c + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/kvstore.c + ${CMAKE_SOURCE_DIR}/src/sds.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/lzf_c.c + 
${CMAKE_SOURCE_DIR}/src/lzf_d.c + ${CMAKE_SOURCE_DIR}/src/pqsort.c + ${CMAKE_SOURCE_DIR}/src/zipmap.c + ${CMAKE_SOURCE_DIR}/src/sha1.c + ${CMAKE_SOURCE_DIR}/src/ziplist.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/memory_prefetch.c + ${CMAKE_SOURCE_DIR}/src/io_threads.c + ${CMAKE_SOURCE_DIR}/src/networking.c + ${CMAKE_SOURCE_DIR}/src/util.c + ${CMAKE_SOURCE_DIR}/src/object.c + ${CMAKE_SOURCE_DIR}/src/db.c + ${CMAKE_SOURCE_DIR}/src/replication.c + ${CMAKE_SOURCE_DIR}/src/rdb.c + ${CMAKE_SOURCE_DIR}/src/t_string.c + ${CMAKE_SOURCE_DIR}/src/t_list.c + ${CMAKE_SOURCE_DIR}/src/t_set.c + ${CMAKE_SOURCE_DIR}/src/t_zset.c + ${CMAKE_SOURCE_DIR}/src/t_hash.c + ${CMAKE_SOURCE_DIR}/src/config.c + ${CMAKE_SOURCE_DIR}/src/aof.c + ${CMAKE_SOURCE_DIR}/src/pubsub.c + ${CMAKE_SOURCE_DIR}/src/multi.c + ${CMAKE_SOURCE_DIR}/src/debug.c + ${CMAKE_SOURCE_DIR}/src/sort.c + ${CMAKE_SOURCE_DIR}/src/intset.c + ${CMAKE_SOURCE_DIR}/src/syncio.c + ${CMAKE_SOURCE_DIR}/src/cluster.c + ${CMAKE_SOURCE_DIR}/src/cluster_legacy.c + ${CMAKE_SOURCE_DIR}/src/cluster_slot_stats.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/endianconv.c + ${CMAKE_SOURCE_DIR}/src/slowlog.c + ${CMAKE_SOURCE_DIR}/src/eval.c + ${CMAKE_SOURCE_DIR}/src/bio.c + ${CMAKE_SOURCE_DIR}/src/rio.c + ${CMAKE_SOURCE_DIR}/src/rand.c + ${CMAKE_SOURCE_DIR}/src/memtest.c + ${CMAKE_SOURCE_DIR}/src/syscheck.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/bitops.c + ${CMAKE_SOURCE_DIR}/src/sentinel.c + ${CMAKE_SOURCE_DIR}/src/notify.c + ${CMAKE_SOURCE_DIR}/src/setproctitle.c + ${CMAKE_SOURCE_DIR}/src/blocked.c + ${CMAKE_SOURCE_DIR}/src/hyperloglog.c + ${CMAKE_SOURCE_DIR}/src/latency.c + ${CMAKE_SOURCE_DIR}/src/sparkline.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-rdb.c + ${CMAKE_SOURCE_DIR}/src/valkey-check-aof.c + ${CMAKE_SOURCE_DIR}/src/geo.c + ${CMAKE_SOURCE_DIR}/src/lazyfree.c + ${CMAKE_SOURCE_DIR}/src/module.c + 
${CMAKE_SOURCE_DIR}/src/evict.c + ${CMAKE_SOURCE_DIR}/src/expire.c + ${CMAKE_SOURCE_DIR}/src/geohash.c + ${CMAKE_SOURCE_DIR}/src/geohash_helper.c + ${CMAKE_SOURCE_DIR}/src/childinfo.c + ${CMAKE_SOURCE_DIR}/src/allocator_defrag.c + ${CMAKE_SOURCE_DIR}/src/defrag.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/rax.c + ${CMAKE_SOURCE_DIR}/src/t_stream.c + ${CMAKE_SOURCE_DIR}/src/listpack.c + ${CMAKE_SOURCE_DIR}/src/localtime.c + ${CMAKE_SOURCE_DIR}/src/lolwut.c + ${CMAKE_SOURCE_DIR}/src/lolwut5.c + ${CMAKE_SOURCE_DIR}/src/lolwut6.c + ${CMAKE_SOURCE_DIR}/src/acl.c + ${CMAKE_SOURCE_DIR}/src/tracking.c + ${CMAKE_SOURCE_DIR}/src/socket.c + ${CMAKE_SOURCE_DIR}/src/tls.c + ${CMAKE_SOURCE_DIR}/src/rdma.c + ${CMAKE_SOURCE_DIR}/src/sha256.c + ${CMAKE_SOURCE_DIR}/src/timeout.c + ${CMAKE_SOURCE_DIR}/src/setcpuaffinity.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/resp_parser.c + ${CMAKE_SOURCE_DIR}/src/call_reply.c + ${CMAKE_SOURCE_DIR}/src/script_lua.c + ${CMAKE_SOURCE_DIR}/src/script.c + ${CMAKE_SOURCE_DIR}/src/functions.c + ${CMAKE_SOURCE_DIR}/src/function_lua.c + ${CMAKE_SOURCE_DIR}/src/commands.c + ${CMAKE_SOURCE_DIR}/src/strl.c + ${CMAKE_SOURCE_DIR}/src/connection.c + ${CMAKE_SOURCE_DIR}/src/unix.c + ${CMAKE_SOURCE_DIR}/src/server.c + ${CMAKE_SOURCE_DIR}/src/logreqres.c) + +# valkey-cli +set(VALKEY_CLI_SRCS + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/valkey-cli.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/serverassert.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/cli_common.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + 
${CMAKE_SOURCE_DIR}/src/strl.c + ${CMAKE_SOURCE_DIR}/src/cli_commands.c) + +# valkey-benchmark +set(VALKEY_BENCHMARK_SRCS + ${CMAKE_SOURCE_DIR}/src/ae.c + ${CMAKE_SOURCE_DIR}/src/anet.c + ${CMAKE_SOURCE_DIR}/src/valkey-benchmark.c + ${CMAKE_SOURCE_DIR}/src/adlist.c + ${CMAKE_SOURCE_DIR}/src/dict.c + ${CMAKE_SOURCE_DIR}/src/zmalloc.c + ${CMAKE_SOURCE_DIR}/src/serverassert.c + ${CMAKE_SOURCE_DIR}/src/release.c + ${CMAKE_SOURCE_DIR}/src/crcspeed.c + ${CMAKE_SOURCE_DIR}/src/crccombine.c + ${CMAKE_SOURCE_DIR}/src/crc64.c + ${CMAKE_SOURCE_DIR}/src/siphash.c + ${CMAKE_SOURCE_DIR}/src/crc16.c + ${CMAKE_SOURCE_DIR}/src/monotonic.c + ${CMAKE_SOURCE_DIR}/src/cli_common.c + ${CMAKE_SOURCE_DIR}/src/mt19937-64.c + ${CMAKE_SOURCE_DIR}/src/strl.c) + +# valkey-rdma module +set(VALKEY_RDMA_MODULE_SRCS ${CMAKE_SOURCE_DIR}/src/rdma.c) + +# valkey-tls module +set(VALKEY_TLS_MODULE_SRCS ${CMAKE_SOURCE_DIR}/src/tls.c) diff --git a/cmake/Modules/Utils.cmake b/cmake/Modules/Utils.cmake new file mode 100644 index 0000000000..59076397de --- /dev/null +++ b/cmake/Modules/Utils.cmake @@ -0,0 +1,115 @@ +# Return the current host distro name. For example: ubuntu, debian, amzn etc +function (valkey_get_distro_name DISTRO_NAME) + if (LINUX AND NOT APPLE) + execute_process( + COMMAND /bin/bash "-c" "cat /etc/os-release |grep ^ID=|cut -d = -f 2" + OUTPUT_VARIABLE _OUT_VAR + OUTPUT_STRIP_TRAILING_WHITESPACE) + # clean the output + string(REPLACE "\"" "" _OUT_VAR "${_OUT_VAR}") + string(REPLACE "." 
"" _OUT_VAR "${_OUT_VAR}") + set(${DISTRO_NAME} + "${_OUT_VAR}" + PARENT_SCOPE) + elseif (APPLE) + set(${DISTRO_NAME} + "darwin" + PARENT_SCOPE) + elseif (IS_FREEBSD) + set(${DISTRO_NAME} + "freebsd" + PARENT_SCOPE) + else () + set(${DISTRO_NAME} + "unknown" + PARENT_SCOPE) + endif () +endfunction () + +function (valkey_parse_version OUT_MAJOR OUT_MINOR OUT_PATCH) + # Read and parse package version from version.h file + file(STRINGS ${CMAKE_SOURCE_DIR}/src/version.h VERSION_LINES) + foreach (LINE ${VERSION_LINES}) + string(FIND "${LINE}" "#define VALKEY_VERSION " VERSION_STR_POS) + if (VERSION_STR_POS GREATER -1) + string(REPLACE "#define VALKEY_VERSION " "" LINE "${LINE}") + string(REPLACE "\"" "" LINE "${LINE}") + # Change "." to ";" to make it a list + string(REPLACE "." ";" LINE "${LINE}") + list(GET LINE 0 _MAJOR) + list(GET LINE 1 _MINOR) + list(GET LINE 2 _PATCH) + message(STATUS "Valkey version: ${_MAJOR}.${_MINOR}.${_PATCH}") + # Set the output variables + set(${OUT_MAJOR} + ${_MAJOR} + PARENT_SCOPE) + set(${OUT_MINOR} + ${_MINOR} + PARENT_SCOPE) + set(${OUT_PATCH} + ${_PATCH} + PARENT_SCOPE) + endif () + endforeach () +endfunction () + +# Given input argument `OPTION_VALUE`, check (case-insensitively) that it is one of the allowed values: +# module/yes/no/on/off/1/0. Any other value aborts configuration with a FATAL_ERROR. +# +# Return value: +# +# If `OPTION_VALUE` is valid, the variable named by `OUT_ARG_ENUM` is set in the caller's scope to: +# +# ~~~ +# - `no` | `0` | `off` => return `0` +# - `yes` | `1` | `on` => return `1` +# - `module` => return `2` +# ~~~ +function (valkey_parse_build_option OPTION_VALUE OUT_ARG_ENUM) + list(APPEND VALID_OPTIONS "yes") + list(APPEND VALID_OPTIONS "1") + list(APPEND VALID_OPTIONS "on") + list(APPEND VALID_OPTIONS "no") + list(APPEND VALID_OPTIONS "0") + list(APPEND VALID_OPTIONS "off") + list(APPEND VALID_OPTIONS "module") + + string(TOLOWER "${OPTION_VALUE}" OPTION_VALUE) + list(FIND VALID_OPTIONS "${OPTION_VALUE}" OPT_INDEX) + if (OPT_INDEX EQUAL -1) # list(FIND) yields -1 when the value is not in the list + message(FATAL_ERROR "Invalid value passed '${OPTION_VALUE}'") + 
endif () + + if ("${OPTION_VALUE}" STREQUAL "yes" + OR "${OPTION_VALUE}" STREQUAL "1" + OR "${OPTION_VALUE}" STREQUAL "on") + set(${OUT_ARG_ENUM} + 1 + PARENT_SCOPE) + elseif ( + "${OPTION_VALUE}" STREQUAL "no" + OR "${OPTION_VALUE}" STREQUAL "0" + OR "${OPTION_VALUE}" STREQUAL "off") + set(${OUT_ARG_ENUM} + 0 + PARENT_SCOPE) + else () + set(${OUT_ARG_ENUM} + 2 + PARENT_SCOPE) + endif () +endfunction () + +function (valkey_pkg_config PKGNAME OUT_VARIABLE) + if (NOT FOUND_PKGCONFIG) + # Locate pkg-config once + find_package(PkgConfig REQUIRED) + set(FOUND_PKGCONFIG 1) + endif () + pkg_check_modules(__PREFIX REQUIRED ${PKGNAME}) + message(STATUS "Found library for '${PKGNAME}': ${__PREFIX_LIBRARIES}") + set(${OUT_VARIABLE} + "${__PREFIX_LIBRARIES}" + PARENT_SCOPE) +endfunction () diff --git a/cmake/Modules/ValkeySetup.cmake b/cmake/Modules/ValkeySetup.cmake new file mode 100644 index 0000000000..8a4d4da1c9 --- /dev/null +++ b/cmake/Modules/ValkeySetup.cmake @@ -0,0 +1,391 @@ +include(CheckIncludeFiles) +include(ProcessorCount) +include(Utils) + +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") + +# Generate compile_commands.json file for IDEs code completion support +set(CMAKE_EXPORT_COMPILE_COMMANDS 1) + +processorcount(VALKEY_PROCESSOR_COUNT) +message(STATUS "Processor count: ${VALKEY_PROCESSOR_COUNT}") + +# Installed executables will have this permissions +set(VALKEY_EXE_PERMISSIONS + OWNER_EXECUTE + OWNER_WRITE + OWNER_READ + GROUP_EXECUTE + GROUP_READ + WORLD_EXECUTE + WORLD_READ) + +set(VALKEY_SERVER_CFLAGS "") +set(VALKEY_SERVER_LDFLAGS "") + +# ---------------------------------------------------- +# Helper functions & macros +# ---------------------------------------------------- +macro (add_valkey_server_compiler_options value) + set(VALKEY_SERVER_CFLAGS "${VALKEY_SERVER_CFLAGS} ${value}") +endmacro () + +macro 
(add_valkey_server_linker_option value) + list(APPEND VALKEY_SERVER_LDFLAGS ${value}) +endmacro () + +macro (get_valkey_server_linker_option return_value) + list(JOIN VALKEY_SERVER_LDFLAGS " " ${return_value}) # Join the collected linker flags into one space-separated string +endmacro () + +set(IS_FREEBSD 0) +if (CMAKE_SYSTEM_NAME MATCHES "^.*BSD$|DragonFly") + message(STATUS "Building for FreeBSD compatible system") + set(IS_FREEBSD 1) + include_directories("/usr/local/include") + add_valkey_server_compiler_options("-DUSE_BACKTRACE") +endif () + +# Helper function for creating symbolic link so that: link -> source +macro (valkey_create_symlink source link) + install( + CODE "execute_process( \ + COMMAND /bin/bash ${CMAKE_BINARY_DIR}/CreateSymlink.sh \ + ${source} \ + ${link} \ + )" + COMPONENT "valkey") +endmacro () + +# Install a binary +macro (valkey_install_bin target) + # Install the target into the binary directory with the executable permissions defined above + install( + TARGETS ${target} + DESTINATION ${CMAKE_INSTALL_BINDIR} + PERMISSIONS ${VALKEY_EXE_PERMISSIONS} + COMPONENT "valkey") +endmacro () + +# Helper function that defines, builds and installs `target`. In addition, it creates a symbolic link between the target +# and `link_name` +macro (valkey_build_and_install_bin target sources ld_flags libs link_name) + add_executable(${target} ${sources}) + + if (USE_JEMALLOC + OR USE_TCMALLOC + OR USE_TCMALLOC_MINIMAL) + # Using custom allocator + target_link_libraries(${target} ${ALLOCATOR_LIB}) + endif () + + # Place this line last to ensure that ${ld_flags} is placed last on the linker line + target_link_libraries(${target} ${libs} ${ld_flags}) + target_link_libraries(${target} hiredis) + if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(${target} OpenSSL::SSL hiredis_ssl) + endif () + + if (IS_FREEBSD) + target_link_libraries(${target} execinfo) + endif () + + # Install cli tool and create a redis symbolic link + valkey_install_bin(${target}) + valkey_create_symlink(${target} ${link_name}) +endmacro () + +# Helper function 
that defines, builds and installs `target` module. +macro (valkey_build_and_install_module target sources ld_flags libs) + add_library(${target} SHARED ${sources}) + + if (USE_JEMALLOC) + # Using jemalloc + target_link_libraries(${target} jemalloc) + endif () + + # Place this line last to ensure that ${ld_flags} is placed last on the linker line + target_link_libraries(${target} ${libs} ${ld_flags}) + if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(${target} OpenSSL::SSL hiredis_ssl) + endif () + + if (IS_FREEBSD) + target_link_libraries(${target} execinfo) + endif () + + # Install cli tool and create a redis symbolic link + valkey_install_bin(${target}) +endmacro () + +# Determine if we are building in Release or Debug mode +if (CMAKE_BUILD_TYPE MATCHES Debug OR CMAKE_BUILD_TYPE MATCHES DebugFull) + set(VALKEY_DEBUG_BUILD 1) + set(VALKEY_RELEASE_BUILD 0) + message(STATUS "Building in debug mode") +else () + set(VALKEY_DEBUG_BUILD 0) + set(VALKEY_RELEASE_BUILD 1) + message(STATUS "Building in release mode") +endif () + +# ---------------------------------------------------- +# Helper functions - end +# ---------------------------------------------------- + +# ---------------------------------------------------- +# Build options (allocator, tls, rdma et al) +# ---------------------------------------------------- + +if (NOT BUILD_MALLOC) + if (APPLE) + set(BUILD_MALLOC "libc") + elseif (UNIX) + set(BUILD_MALLOC "jemalloc") + endif () +endif () + +# User may pass different allocator library. 
Using -DBUILD_MALLOC=<name>, make sure it is a valid value +if (BUILD_MALLOC) + if ("${BUILD_MALLOC}" STREQUAL "jemalloc") + set(MALLOC_LIB "jemalloc") + set(ALLOCATOR_LIB "jemalloc") + add_valkey_server_compiler_options("-DUSE_JEMALLOC") + set(USE_JEMALLOC 1) + elseif ("${BUILD_MALLOC}" STREQUAL "libc") + set(MALLOC_LIB "libc") + elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc") + set(MALLOC_LIB "tcmalloc") + valkey_pkg_config(libtcmalloc ALLOCATOR_LIB) + + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC 1) + elseif ("${BUILD_MALLOC}" STREQUAL "tcmalloc_minimal") + set(MALLOC_LIB "tcmalloc_minimal") + valkey_pkg_config(libtcmalloc_minimal ALLOCATOR_LIB) + + add_valkey_server_compiler_options("-DUSE_TCMALLOC") + set(USE_TCMALLOC_MINIMAL 1) + else () + message(FATAL_ERROR "BUILD_MALLOC can be one of: jemalloc, libc, tcmalloc or tcmalloc_minimal") + endif () +endif () + +message(STATUS "Using ${MALLOC_LIB}") + +# TLS support +if (BUILD_TLS) + valkey_parse_build_option(${BUILD_TLS} USE_TLS) + if (USE_TLS EQUAL 1) + # Only search for OpenSSL if needed + find_package(OpenSSL REQUIRED) + message(STATUS "OpenSSL include dir: ${OPENSSL_INCLUDE_DIR}") + message(STATUS "OpenSSL libraries: ${OPENSSL_LIBRARIES}") + include_directories(${OPENSSL_INCLUDE_DIR}) + endif () + + if (USE_TLS EQUAL 1) + add_valkey_server_compiler_options("-DUSE_OPENSSL=1") + add_valkey_server_compiler_options("-DBUILD_TLS_MODULE=0") + else () + # Only builtin TLS is handled here; any other value (e.g. "module") is not supported, so warn and disable TLS
+ message(WARNING "BUILD_TLS can be one of: [ON | OFF | 1 | 0], but '${BUILD_TLS}' was provided") + message(STATUS "TLS support is disabled") + set(USE_TLS 0) + endif () +else () + # By default, TLS is disabled + message(STATUS "TLS is disabled") + set(USE_TLS 0) +endif () + +if (BUILD_RDMA) + set(BUILD_RDMA_MODULE 0) + # RDMA support (Linux only) + if (LINUX AND NOT APPLE) + valkey_parse_build_option(${BUILD_RDMA} USE_RDMA) + find_package(PkgConfig REQUIRED) + # Locate librdmacm & libibverbs, fail if we can't find them + valkey_pkg_config(librdmacm RDMACM_LIBS) + valkey_pkg_config(libibverbs IBVERBS_LIBS) + message(STATUS "${RDMACM_LIBS};${IBVERBS_LIBS}") + list(APPEND RDMA_LIBS "${RDMACM_LIBS};${IBVERBS_LIBS}") + + if (USE_RDMA EQUAL 2) # Module + message(STATUS "Building RDMA as module") + add_valkey_server_compiler_options("-DUSE_RDMA=2") + set(BUILD_RDMA_MODULE 2) + elseif (USE_RDMA EQUAL 1) # Builtin + message(STATUS "Building RDMA as builtin") + add_valkey_server_compiler_options("-DUSE_RDMA=1") + add_valkey_server_compiler_options("-DBUILD_RDMA_MODULE=0") + list(APPEND SERVER_LIBS "${RDMA_LIBS}") + endif () + else () + message(WARNING "RDMA is only supported on Linux platforms") + endif () +else () + # By default, RDMA is disabled + message(STATUS "RDMA is disabled") + set(USE_RDMA 0) +endif () + +set(BUILDING_ARM64 0) +set(BUILDING_ARM32 0) + +if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") + set(BUILDING_ARM64 1) +endif () + +if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm") + set(BUILDING_ARM32 1) +endif () + +message(STATUS "Building on ${CMAKE_HOST_SYSTEM_NAME}") +if (BUILDING_ARM64) + message(STATUS "Compiling valkey for ARM64") + add_valkey_server_linker_option("-funwind-tables") +endif () + +if (APPLE) + add_valkey_server_linker_option("-rdynamic") + add_valkey_server_linker_option("-ldl") +elseif (UNIX) + add_valkey_server_linker_option("-rdynamic") + add_valkey_server_linker_option("-pthread") + 
add_valkey_server_linker_option("-ldl") + add_valkey_server_linker_option("-lm") +endif () + +if (VALKEY_DEBUG_BUILD) + # Debug build, use enable "-fno-omit-frame-pointer" + add_valkey_server_compiler_options("-fno-omit-frame-pointer") +endif () + +# Check for Atomic +check_include_files(stdatomic.h HAVE_C11_ATOMIC) +if (HAVE_C11_ATOMIC) + add_valkey_server_compiler_options("-std=gnu11") +else () + add_valkey_server_compiler_options("-std=c99") +endif () + +# Sanitizer +if (BUILD_SANITIZER) + # Common CFLAGS + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-sanitize-recover=all") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fno-omit-frame-pointer") + if ("${BUILD_SANITIZER}" STREQUAL "address") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=address") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=address") + elseif ("${BUILD_SANITIZER}" STREQUAL "thread") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=thread") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=thread") + elseif ("${BUILD_SANITIZER}" STREQUAL "undefined") + list(APPEND VALKEY_SANITAIZER_CFLAGS "-fsanitize=undefined") + list(APPEND VALKEY_SANITAIZER_LDFLAGS "-fsanitize=undefined") + else () + message(FATAL_ERROR "Unknown sanitizer: ${BUILD_SANITIZER}") + endif () +endif () + +include_directories("${CMAKE_SOURCE_DIR}/deps/hiredis") +include_directories("${CMAKE_SOURCE_DIR}/deps/linenoise") +include_directories("${CMAKE_SOURCE_DIR}/deps/lua/src") +include_directories("${CMAKE_SOURCE_DIR}/deps/hdr_histogram") +include_directories("${CMAKE_SOURCE_DIR}/deps/fpconv") + +add_subdirectory("${CMAKE_SOURCE_DIR}/deps") + +# Update linker flags for the allocator +if (USE_JEMALLOC) + include_directories("${CMAKE_SOURCE_DIR}/deps/jemalloc/include") +endif () + +# Common compiler flags +add_valkey_server_compiler_options("-pedantic") + +# ---------------------------------------------------- +# Build options (allocator, tls, rdma et al) - end +# ---------------------------------------------------- + +# 
------------------------------------------------- +# Code Generation section +# ------------------------------------------------- +find_program(PYTHON_EXE python3) +if (PYTHON_EXE) + # Python based code generation + message(STATUS "Found python3: ${PYTHON_EXE}") + # Rule for generating commands.def file from json files + message(STATUS "Adding target generate_commands_def") + file(GLOB COMMAND_FILES_JSON "${CMAKE_SOURCE_DIR}/src/commands/*.json") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/commands_def_generated + DEPENDS ${COMMAND_FILES_JSON} + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-command-code.py + COMMAND touch ${CMAKE_BINARY_DIR}/commands_def_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_commands_def DEPENDS ${CMAKE_BINARY_DIR}/commands_def_generated) + + # Rule for generating fmtargs.h + message(STATUS "Adding target generate_fmtargs_h") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/fmtargs_generated + DEPENDS ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py + COMMAND sed '/Everything/,$$d' fmtargs.h > fmtargs.h.tmp + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-fmtargs.py >> fmtargs.h.tmp + COMMAND mv fmtargs.h.tmp fmtargs.h + COMMAND touch ${CMAKE_BINARY_DIR}/fmtargs_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_fmtargs_h DEPENDS ${CMAKE_BINARY_DIR}/fmtargs_generated) + + # Rule for generating test_files.h + message(STATUS "Adding target generate_test_files_h") + file(GLOB UNIT_TEST_SRCS "${CMAKE_SOURCE_DIR}/src/unit/*.c") + add_custom_command( + OUTPUT ${CMAKE_BINARY_DIR}/test_files_generated + DEPENDS "${UNIT_TEST_SRCS};${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py" + COMMAND ${PYTHON_EXE} ${CMAKE_SOURCE_DIR}/utils/generate-unit-test-header.py + COMMAND touch ${CMAKE_BINARY_DIR}/test_files_generated + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + add_custom_target(generate_test_files_h DEPENDS 
${CMAKE_BINARY_DIR}/test_files_generated) +else () + # Fake targets + add_custom_target(generate_commands_def) + add_custom_target(generate_fmtargs_h) + add_custom_target(generate_test_files_h) +endif () + +# Generate release.h file (always) +add_custom_target( + release_header + COMMAND sh -c '${CMAKE_SOURCE_DIR}/src/mkreleasehdr.sh' + WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}/src") + +# ------------------------------------------------- +# Code Generation section - end +# ------------------------------------------------- + +# ---------------------------------------------------------- +# All our source files are defined in SourceFiles.cmake file +# ---------------------------------------------------------- +include(SourceFiles) + +# Clear the below variables from the cache +unset(CMAKE_C_FLAGS CACHE) +unset(VALKEY_SERVER_LDFLAGS CACHE) +unset(VALKEY_SERVER_CFLAGS CACHE) +unset(PYTHON_EXE CACHE) +unset(HAVE_C11_ATOMIC CACHE) +unset(USE_TLS CACHE) +unset(USE_RDMA CACHE) +unset(BUILD_TLS CACHE) +unset(BUILD_RDMA CACHE) +unset(BUILD_MALLOC CACHE) +unset(USE_JEMALLOC CACHE) +unset(BUILD_TLS_MODULE CACHE) +unset(BUILD_TLS_BUILTIN CACHE) diff --git a/deps/CMakeLists.txt b/deps/CMakeLists.txt new file mode 100644 index 0000000000..c904b94031 --- /dev/null +++ b/deps/CMakeLists.txt @@ -0,0 +1,26 @@ +add_subdirectory(jemalloc) +add_subdirectory(lua) + +# Set hiredis options. We need to disable the defaults set in the OPTION(..) 
we do this by setting them in the CACHE +set(BUILD_SHARED_LIBS + OFF + CACHE BOOL "Build shared libraries") +set(DISABLE_TESTS + ON + CACHE BOOL "If tests should be compiled or not") +if (USE_TLS) # Module or no module + message(STATUS "Building hiredis_ssl") + set(ENABLE_SSL + ON + CACHE BOOL "Should we test SSL connections") +endif () + +add_subdirectory(hiredis) +add_subdirectory(linenoise) +add_subdirectory(fpconv) +add_subdirectory(hdr_histogram) + +# Clear any cached variables passed to hiredis from the cache +unset(BUILD_SHARED_LIBS CACHE) +unset(DISABLE_TESTS CACHE) +unset(ENABLE_SSL CACHE) diff --git a/deps/Makefile b/deps/Makefile index f1e4bd6ce2..72389def95 100644 --- a/deps/Makefile +++ b/deps/Makefile @@ -42,6 +42,7 @@ distclean: -(cd jemalloc && [ -f Makefile ] && $(MAKE) distclean) > /dev/null || true -(cd hdr_histogram && $(MAKE) clean) > /dev/null || true -(cd fpconv && $(MAKE) clean) > /dev/null || true + -(cd fast_float_c_interface && $(MAKE) clean) > /dev/null || true -(rm -f .make-*) .PHONY: distclean @@ -116,3 +117,9 @@ jemalloc: .make-prerequisites cd jemalloc && $(MAKE) lib/libjemalloc.a .PHONY: jemalloc + +fast_float_c_interface: .make-prerequisites + @printf '%b %b\n' $(MAKECOLOR)MAKE$(ENDCOLOR) $(BINCOLOR)$@$(ENDCOLOR) + cd fast_float_c_interface && $(MAKE) + +.PHONY: fast_float_c_interface diff --git a/deps/README.md b/deps/README.md index b918b47456..97a7baf64b 100644 --- a/deps/README.md +++ b/deps/README.md @@ -6,6 +6,7 @@ should be provided by the operating system. * **linenoise** is a readline replacement. It is developed by the same authors of Valkey but is managed as a separated project and updated as needed. * **lua** is Lua 5.1 with minor changes for security and additional libraries. * **hdr_histogram** Used for per-command latency tracking histograms. +* **fast_float** is a replacement for strtod to convert strings to floats efficiently. 
How to upgrade the above dependencies === @@ -105,3 +106,17 @@ We use a customized version based on master branch commit e4448cf6d1cd08fff51981 2. Copy updated files from newer version onto files in /hdr_histogram. 3. Apply the changes from 1 above to the updated files. +fast_float +--- +The fast_float library provides fast header-only implementations for the C++ from_chars functions for `float` and `double` types as well as integer types. These functions convert ASCII strings representing decimal values (e.g., `1.3e10`) into binary types. The functions are much faster than comparable number-parsing functions from existing C++ standard libraries. + +Specifically, `fast_float` provides the following function to parse floating-point numbers with a C++17-like syntax (the library itself only requires C++11): + + template <typename T, typename UC = char, typename = FASTFLOAT_ENABLE_IF(is_supported_float_type<T>())> + from_chars_result_t<UC> from_chars(UC const *first, UC const *last, T &value, chars_format fmt = chars_format::general); + +To upgrade the library, +1. Check out https://github.com/fastfloat/fast_float/tree/main +2. cd fast_float +3. Invoke "python3 ./script/amalgamate.py --output fast_float.h" +4. Copy fast_float.h file to "deps/fast_float/". diff --git a/deps/fast_float/fast_float.h b/deps/fast_float/fast_float.h new file mode 100644 index 0000000000..9ba3bc2e97 --- /dev/null +++ b/deps/fast_float/fast_float.h @@ -0,0 +1,3912 @@ +// fast_float by Daniel Lemire +// fast_float by João Paulo Magalhaes +// +// +// with contributions from Eugene Golushkov +// with contributions from Maksim Kita +// with contributions from Marcin Wojdyr +// with contributions from Neal Richardson +// with contributions from Tim Paine +// with contributions from Fabio Pellacini +// with contributions from Lénárd Szolnoki +// with contributions from Jan Pharago +// with contributions from Maya Warrier +// with contributions from Taha Khokhar +// +// +// Licensed under the Apache License, Version 2.0, or the +// MIT License or the Boost License. 
This file may not be copied, +// modified, or distributed except according to those terms. +// +// MIT License Notice +// +// MIT License +// +// Copyright (c) 2021 The fast_float authors +// +// Permission is hereby granted, free of charge, to any +// person obtaining a copy of this software and associated +// documentation files (the "Software"), to deal in the +// Software without restriction, including without +// limitation the rights to use, copy, modify, merge, +// publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice +// shall be included in all copies or substantial portions +// of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +// ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +// TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +// PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +// SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +// IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. +// +// Apache License (Version 2.0) Notice +// +// Copyright 2021 The fast_float authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// +// BOOST License Notice +// +// Boost Software License - Version 1.0 - August 17th, 2003 +// +// Permission is hereby granted, free of charge, to any person or organization +// obtaining a copy of the software and accompanying documentation covered by +// this license (the "Software") to use, reproduce, display, distribute, +// execute, and transmit the Software, and to prepare derivative works of the +// Software, and to permit third-parties to whom the Software is furnished to +// do so, all subject to the following: +// +// The copyright notices in the Software and this entire statement, including +// the above license grant, this restriction and the following disclaimer, +// must be included in all copies of the Software, in whole or in part, and +// all derivative works of the Software, unless such copies or derivative +// works are solely in the form of machine-executable object code generated by +// a source language processor. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+// + +#ifndef FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H +#define FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifdef __has_include +#if __has_include(<version>) +#include <version> +#endif +#endif + +// Testing for https://wg21.link/N3652, adopted in C++14 +#if __cpp_constexpr >= 201304 +#define FASTFLOAT_CONSTEXPR14 constexpr +#else +#define FASTFLOAT_CONSTEXPR14 +#endif + +#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L +#define FASTFLOAT_HAS_BIT_CAST 1 +#else +#define FASTFLOAT_HAS_BIT_CAST 0 +#endif + +#if defined(__cpp_lib_is_constant_evaluated) && \ + __cpp_lib_is_constant_evaluated >= 201811L +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 1 +#else +#define FASTFLOAT_HAS_IS_CONSTANT_EVALUATED 0 +#endif + +// Testing for relevant C++20 constexpr library features +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED && FASTFLOAT_HAS_BIT_CAST && \ + __cpp_lib_constexpr_algorithms >= 201806L /*For std::copy and std::fill*/ +#define FASTFLOAT_CONSTEXPR20 constexpr +#define FASTFLOAT_IS_CONSTEXPR 1 +#else +#define FASTFLOAT_CONSTEXPR20 +#define FASTFLOAT_IS_CONSTEXPR 0 +#endif + +#if __cplusplus >= 201703L || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 0 +#else +#define FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE 1 +#endif + +#endif // FASTFLOAT_CONSTEXPR_FEATURE_DETECT_H + +#ifndef FASTFLOAT_FLOAT_COMMON_H +#define FASTFLOAT_FLOAT_COMMON_H + +#include <cfloat> +#include <cstdint> +#include <cassert> +#include <cstring> +#include <type_traits> +#include <system_error> +#ifdef __has_include +#if __has_include(<stdfloat>) && (__cplusplus > 202002L || _MSVC_LANG > 202002L) +#include <stdfloat> +#endif +#endif + +namespace fast_float { + +#define FASTFLOAT_JSONFMT (1 << 5) +#define FASTFLOAT_FORTRANFMT (1 << 6) + +enum chars_format { + scientific = 1 << 0, + fixed = 1 << 2, + hex = 1 << 3, + no_infnan = 1 << 4, + // RFC 8259: https://datatracker.ietf.org/doc/html/rfc8259#section-6 + json = FASTFLOAT_JSONFMT | fixed | scientific | no_infnan, + // Extension of RFC 8259 where, e.g., "inf" and "nan" are allowed. 
+ json_or_infnan = FASTFLOAT_JSONFMT | fixed | scientific, + fortran = FASTFLOAT_FORTRANFMT | fixed | scientific, + general = fixed | scientific +}; + +template <typename UC> struct from_chars_result_t { + UC const *ptr; + std::errc ec; +}; +using from_chars_result = from_chars_result_t<char>; + +template <typename UC> struct parse_options_t { + constexpr explicit parse_options_t(chars_format fmt = chars_format::general, + UC dot = UC('.')) + : format(fmt), decimal_point(dot) {} + + /** Which number formats are accepted */ + chars_format format; + /** The character used as decimal point */ + UC decimal_point; +}; +using parse_options = parse_options_t<char>; + +} // namespace fast_float + +#if FASTFLOAT_HAS_BIT_CAST +#include <bit> +#endif + +#if (defined(__x86_64) || defined(__x86_64__) || defined(_M_X64) || \ + defined(__amd64) || defined(__aarch64__) || defined(_M_ARM64) || \ + defined(__MINGW64__) || defined(__s390x__) || \ + (defined(__ppc64__) || defined(__PPC64__) || defined(__ppc64le__) || \ + defined(__PPC64LE__)) || \ + defined(__loongarch64)) +#define FASTFLOAT_64BIT 1 +#elif (defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__arm__) || defined(_M_ARM) || defined(__ppc__) || \ + defined(__MINGW32__) || defined(__EMSCRIPTEN__)) +#define FASTFLOAT_32BIT 1 +#else + // Need to check incrementally, since SIZE_MAX is a size_t, avoid overflow. +// We can never tell the register width, but the SIZE_MAX is a good +// approximation. UINTPTR_MAX and INTPTR_MAX are optional, so avoid them for max +// portability. +#if SIZE_MAX == 0xffff +#error Unknown platform (16-bit, unsupported) +#elif SIZE_MAX == 0xffffffff +#define FASTFLOAT_32BIT 1 +#elif SIZE_MAX == 0xffffffffffffffff +#define FASTFLOAT_64BIT 1 +#else +#error Unknown platform (not 32-bit, not 64-bit?) 
+#endif +#endif + +#if ((defined(_WIN32) || defined(_WIN64)) && !defined(__clang__)) || \ + (defined(_M_ARM64) && !defined(__MINGW32__)) +#include <intrin.h> +#endif + +#if defined(_MSC_VER) && !defined(__clang__) +#define FASTFLOAT_VISUAL_STUDIO 1 +#endif + +#if defined __BYTE_ORDER__ && defined __ORDER_BIG_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined _WIN32 +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#if defined(__APPLE__) || defined(__FreeBSD__) +#include <machine/endian.h> +#elif defined(sun) || defined(__sun) +#include <sys/byteorder.h> +#elif defined(__MVS__) +#include <sys/endian.h> +#else +#ifdef __has_include +#if __has_include(<endian.h>) +#include <endian.h> +#endif //__has_include(<endian.h>) +#endif //__has_include +#endif +# +#ifndef __BYTE_ORDER__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#ifndef __ORDER_LITTLE_ENDIAN__ +// safe choice +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#endif +# +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define FASTFLOAT_IS_BIG_ENDIAN 0 +#else +#define FASTFLOAT_IS_BIG_ENDIAN 1 +#endif +#endif + +#if defined(__SSE2__) || (defined(FASTFLOAT_VISUAL_STUDIO) && \ + (defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP == 2))) +#define FASTFLOAT_SSE2 1 +#endif + +#if defined(__aarch64__) || defined(_M_ARM64) +#define FASTFLOAT_NEON 1 +#endif + +#if defined(FASTFLOAT_SSE2) || defined(FASTFLOAT_NEON) +#define FASTFLOAT_HAS_SIMD 1 +#endif + +#if defined(__GNUC__) +// disable -Wcast-align=strict (GCC only) +#define FASTFLOAT_SIMD_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wcast-align\"") +#else +#define FASTFLOAT_SIMD_DISABLE_WARNINGS +#endif + +#if defined(__GNUC__) +#define FASTFLOAT_SIMD_RESTORE_WARNINGS _Pragma("GCC diagnostic pop") +#else +#define FASTFLOAT_SIMD_RESTORE_WARNINGS +#endif + +#ifdef FASTFLOAT_VISUAL_STUDIO +#define fastfloat_really_inline __forceinline +#else +#define fastfloat_really_inline inline __attribute__((always_inline)) +#endif + +#ifndef FASTFLOAT_ASSERT 
+#define FASTFLOAT_ASSERT(x) \ + { ((void)(x)); } +#endif + +#ifndef FASTFLOAT_DEBUG_ASSERT +#define FASTFLOAT_DEBUG_ASSERT(x) \ + { ((void)(x)); } +#endif + +// rust style `try!()` macro, or `?` operator +#define FASTFLOAT_TRY(x) \ + { \ + if (!(x)) \ + return false; \ + } + +#define FASTFLOAT_ENABLE_IF(...) \ + typename std::enable_if<(__VA_ARGS__), int>::type + +namespace fast_float { + +fastfloat_really_inline constexpr bool cpp20_and_in_constexpr() { +#if FASTFLOAT_HAS_IS_CONSTANT_EVALUATED + return std::is_constant_evaluated(); +#else + return false; +#endif +} + +template <typename T> +fastfloat_really_inline constexpr bool is_supported_float_type() { + return std::is_same<T, float>::value || std::is_same<T, double>::value +#if __STDCPP_FLOAT32_T__ + || std::is_same<T, std::float32_t>::value +#endif +#if __STDCPP_FLOAT64_T__ + || std::is_same<T, std::float64_t>::value +#endif + ; +} + +template <typename UC> +fastfloat_really_inline constexpr bool is_supported_char_type() { + return std::is_same<UC, char>::value || std::is_same<UC, wchar_t>::value || + std::is_same<UC, char16_t>::value || std::is_same<UC, char32_t>::value; +} + +// Compares two ASCII strings in a case insensitive manner. +template <typename UC> +inline FASTFLOAT_CONSTEXPR14 bool +fastfloat_strncasecmp(UC const *input1, UC const *input2, size_t length) { + char running_diff{0}; + for (size_t i = 0; i < length; ++i) { + running_diff |= (char(input1[i]) ^ char(input2[i])); + } + return (running_diff == 0) || (running_diff == 32); +} + +#ifndef FLT_EVAL_METHOD +#error "FLT_EVAL_METHOD should be defined, please include cfloat." 
+#endif + +// a pointer and a length to a contiguous block of memory +template <typename T> struct span { + const T *ptr; + size_t length; + constexpr span(const T *_ptr, size_t _length) : ptr(_ptr), length(_length) {} + constexpr span() : ptr(nullptr), length(0) {} + + constexpr size_t len() const noexcept { return length; } + + FASTFLOAT_CONSTEXPR14 const T &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return ptr[index]; + } +}; + +struct value128 { + uint64_t low; + uint64_t high; + constexpr value128(uint64_t _low, uint64_t _high) : low(_low), high(_high) {} + constexpr value128() : low(0), high(0) {} +}; + +/* Helper C++14 constexpr generic implementation of leading_zeroes */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int +leading_zeroes_generic(uint64_t input_num, int last_bit = 0) { + if (input_num & uint64_t(0xffffffff00000000)) { + input_num >>= 32; + last_bit |= 32; + } + if (input_num & uint64_t(0xffff0000)) { + input_num >>= 16; + last_bit |= 16; + } + if (input_num & uint64_t(0xff00)) { + input_num >>= 8; + last_bit |= 8; + } + if (input_num & uint64_t(0xf0)) { + input_num >>= 4; + last_bit |= 4; + } + if (input_num & uint64_t(0xc)) { + input_num >>= 2; + last_bit |= 2; + } + if (input_num & uint64_t(0x2)) { /* input_num >>= 1; */ + last_bit |= 1; + } + return 63 - last_bit; +} + +/* result might be undefined when input_num is zero */ +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 int +leading_zeroes(uint64_t input_num) { + assert(input_num > 0); + if (cpp20_and_in_constexpr()) { + return leading_zeroes_generic(input_num); + } +#ifdef FASTFLOAT_VISUAL_STUDIO +#if defined(_M_X64) || defined(_M_ARM64) + unsigned long leading_zero = 0; + // Search the mask data from most significant bit (MSB) + // to least significant bit (LSB) for a set bit (1).
+ _BitScanReverse64(&leading_zero, input_num); + return (int)(63 - leading_zero); +#else + return leading_zeroes_generic(input_num); +#endif +#else + return __builtin_clzll(input_num); +#endif +} + +// slow emulation routine for 32-bit +fastfloat_really_inline constexpr uint64_t emulu(uint32_t x, uint32_t y) { + return x * (uint64_t)y; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +umul128_generic(uint64_t ab, uint64_t cd, uint64_t *hi) { + uint64_t ad = emulu((uint32_t)(ab >> 32), (uint32_t)cd); + uint64_t bd = emulu((uint32_t)ab, (uint32_t)cd); + uint64_t adbc = ad + emulu((uint32_t)ab, (uint32_t)(cd >> 32)); + uint64_t adbc_carry = (uint64_t)(adbc < ad); + uint64_t lo = bd + (adbc << 32); + *hi = emulu((uint32_t)(ab >> 32), (uint32_t)(cd >> 32)) + (adbc >> 32) + + (adbc_carry << 32) + (uint64_t)(lo < bd); + return lo; +} + +#ifdef FASTFLOAT_32BIT + +// slow emulation routine for 32-bit +#if !defined(__MINGW64__) +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t _umul128(uint64_t ab, + uint64_t cd, + uint64_t *hi) { + return umul128_generic(ab, cd, hi); +} +#endif // !__MINGW64__ + +#endif // FASTFLOAT_32BIT + +// compute 64-bit a*b +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +full_multiplication(uint64_t a, uint64_t b) { + if (cpp20_and_in_constexpr()) { + value128 answer; + answer.low = umul128_generic(a, b, &answer.high); + return answer; + } + value128 answer; +#if defined(_M_ARM64) && !defined(__MINGW32__) + // ARM64 has native support for 64-bit multiplications, no need to emulate + // But MinGW on ARM64 doesn't have native support for 64-bit multiplications + answer.high = __umulh(a, b); + answer.low = a * b; +#elif defined(FASTFLOAT_32BIT) || \ + (defined(_WIN64) && !defined(__clang__) && !defined(_M_ARM64)) + answer.low = _umul128(a, b, &answer.high); // _umul128 not available on ARM64 +#elif defined(FASTFLOAT_64BIT) && defined(__SIZEOF_INT128__) + __uint128_t r = ((__uint128_t)a) * b; + answer.low = uint64_t(r); + 
answer.high = uint64_t(r >> 64); +#else + answer.low = umul128_generic(a, b, &answer.high); +#endif + return answer; +} + +struct adjusted_mantissa { + uint64_t mantissa{0}; + int32_t power2{0}; // a negative value indicates an invalid result + adjusted_mantissa() = default; + constexpr bool operator==(const adjusted_mantissa &o) const { + return mantissa == o.mantissa && power2 == o.power2; + } + constexpr bool operator!=(const adjusted_mantissa &o) const { + return mantissa != o.mantissa || power2 != o.power2; + } +}; + +// Bias so we can get the real exponent with an invalid adjusted_mantissa. +constexpr static int32_t invalid_am_bias = -0x8000; + +// used for binary_format_lookup_tables::max_mantissa +constexpr uint64_t constant_55555 = 5 * 5 * 5 * 5 * 5; + +template struct binary_format_lookup_tables; + +template struct binary_format : binary_format_lookup_tables { + using equiv_uint = + typename std::conditional::type; + + static inline constexpr int mantissa_explicit_bits(); + static inline constexpr int minimum_exponent(); + static inline constexpr int infinite_power(); + static inline constexpr int sign_index(); + static inline constexpr int + min_exponent_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int max_exponent_fast_path(); + static inline constexpr int max_exponent_round_to_even(); + static inline constexpr int min_exponent_round_to_even(); + static inline constexpr uint64_t max_mantissa_fast_path(int64_t power); + static inline constexpr uint64_t + max_mantissa_fast_path(); // used when fegetround() == FE_TONEAREST + static inline constexpr int largest_power_of_ten(); + static inline constexpr int smallest_power_of_ten(); + static inline constexpr T exact_power_of_ten(int64_t power); + static inline constexpr size_t max_digits(); + static inline constexpr equiv_uint exponent_mask(); + static inline constexpr equiv_uint mantissa_mask(); + static inline constexpr equiv_uint hidden_bit_mask(); +}; + +template struct 
binary_format_lookup_tables { + static constexpr double powers_of_ten[] = { + 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, + 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22}; + + // Largest integer value v so that (5**index * v) <= 1<<53. + // 0x20000000000000 == 1 << 53 + static constexpr uint64_t max_mantissa[] = { + 0x20000000000000, + 0x20000000000000 / 5, + 0x20000000000000 / (5 * 5), + 0x20000000000000 / (5 * 5 * 5), + 0x20000000000000 / (5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555), + 0x20000000000000 / (constant_55555 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * 5 * 5 * 5 * 5), + 0x20000000000000 / + (constant_55555 * constant_55555 * constant_55555 * constant_55555), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5), + 0x20000000000000 / (constant_55555 * constant_55555 * constant_55555 * + constant_55555 * 5 * 5 * 5 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr double 
binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template struct binary_format_lookup_tables { + static constexpr float powers_of_ten[] = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, + 1e6f, 1e7f, 1e8f, 1e9f, 1e10f}; + + // Largest integer value v so that (5**index * v) <= 1<<24. + // 0x1000000 == 1<<24 + static constexpr uint64_t max_mantissa[] = { + 0x1000000, + 0x1000000 / 5, + 0x1000000 / (5 * 5), + 0x1000000 / (5 * 5 * 5), + 0x1000000 / (5 * 5 * 5 * 5), + 0x1000000 / (constant_55555), + 0x1000000 / (constant_55555 * 5), + 0x1000000 / (constant_55555 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * 5 * 5 * 5 * 5), + 0x1000000 / (constant_55555 * constant_55555), + 0x1000000 / (constant_55555 * constant_55555 * 5)}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr float binary_format_lookup_tables::powers_of_ten[]; + +template +constexpr uint64_t binary_format_lookup_tables::max_mantissa[]; + +#endif + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -22; +#endif +} + +template <> +inline constexpr int binary_format::min_exponent_fast_path() { +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return 0; +#else + return -10; +#endif +} + +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 52; +} +template <> +inline constexpr int binary_format::mantissa_explicit_bits() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 23; +} + +template <> +inline constexpr int binary_format::max_exponent_round_to_even() { + return 10; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return -4; +} + +template <> +inline constexpr int binary_format::min_exponent_round_to_even() { + return 
-17; +} + +template <> inline constexpr int binary_format::minimum_exponent() { + return -1023; +} +template <> inline constexpr int binary_format::minimum_exponent() { + return -127; +} + +template <> inline constexpr int binary_format::infinite_power() { + return 0x7FF; +} +template <> inline constexpr int binary_format::infinite_power() { + return 0xFF; +} + +template <> inline constexpr int binary_format::sign_index() { + return 63; +} +template <> inline constexpr int binary_format::sign_index() { + return 31; +} + +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 22; +} +template <> +inline constexpr int binary_format::max_exponent_fast_path() { + return 10; +} + +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 22 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} +template <> +inline constexpr uint64_t binary_format::max_mantissa_fast_path() { + return uint64_t(2) << mantissa_explicit_bits(); +} +template <> +inline constexpr uint64_t +binary_format::max_mantissa_fast_path(int64_t power) { + // caller is responsible to ensure that + // power >= 0 && power <= 10 + // + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)max_mantissa[0], max_mantissa[power]; +} + +template <> +inline constexpr double +binary_format::exact_power_of_ten(int64_t power) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)powers_of_ten[0], powers_of_ten[power]; +} +template <> +inline constexpr float binary_format::exact_power_of_ten(int64_t power) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + return (void)powers_of_ten[0], powers_of_ten[power]; +} + +template <> inline 
constexpr int binary_format::largest_power_of_ten() { + return 308; +} +template <> inline constexpr int binary_format::largest_power_of_ten() { + return 38; +} + +template <> +inline constexpr int binary_format::smallest_power_of_ten() { + return -342; +} +template <> inline constexpr int binary_format::smallest_power_of_ten() { + return -64; +} + +template <> inline constexpr size_t binary_format::max_digits() { + return 769; +} +template <> inline constexpr size_t binary_format::max_digits() { + return 114; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { + return 0x7F800000; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::exponent_mask() { + return 0x7FF0000000000000; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { + return 0x007FFFFF; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::mantissa_mask() { + return 0x000FFFFFFFFFFFFF; +} + +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { + return 0x00800000; +} +template <> +inline constexpr binary_format::equiv_uint +binary_format::hidden_bit_mask() { + return 0x0010000000000000; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +to_float(bool negative, adjusted_mantissa am, T &value) { + using fastfloat_uint = typename binary_format::equiv_uint; + fastfloat_uint word = (fastfloat_uint)am.mantissa; + word |= fastfloat_uint(am.power2) + << binary_format::mantissa_explicit_bits(); + word |= fastfloat_uint(negative) << binary_format::sign_index(); +#if FASTFLOAT_HAS_BIT_CAST + value = std::bit_cast(word); +#else + ::memcpy(&value, &word, sizeof(T)); +#endif +} + +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default +template struct space_lut { + static constexpr bool value[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr bool space_lut::value[]; + +#endif + +inline constexpr bool is_space(uint8_t c) { return space_lut<>::value[c]; } +#endif + +template static constexpr uint64_t int_cmp_zeros() { + static_assert((sizeof(UC) == 1) || (sizeof(UC) == 2) || (sizeof(UC) == 4), + "Unsupported character size"); + return (sizeof(UC) == 1) ? 0x3030303030303030 + : (sizeof(UC) == 2) + ? 
(uint64_t(UC('0')) << 48 | uint64_t(UC('0')) << 32 | + uint64_t(UC('0')) << 16 | UC('0')) + : (uint64_t(UC('0')) << 32 | UC('0')); +} +template static constexpr int int_cmp_len() { + return sizeof(uint64_t) / sizeof(UC); +} +template static constexpr UC const *str_const_nan() { + return nullptr; +} +template <> constexpr char const *str_const_nan() { return "nan"; } +template <> constexpr wchar_t const *str_const_nan() { return L"nan"; } +template <> constexpr char16_t const *str_const_nan() { + return u"nan"; +} +template <> constexpr char32_t const *str_const_nan() { + return U"nan"; +} +template static constexpr UC const *str_const_inf() { + return nullptr; +} +template <> constexpr char const *str_const_inf() { return "infinity"; } +template <> constexpr wchar_t const *str_const_inf() { + return L"infinity"; +} +template <> constexpr char16_t const *str_const_inf() { + return u"infinity"; +} +template <> constexpr char32_t const *str_const_inf() { + return U"infinity"; +} + +template struct int_luts { + static constexpr uint8_t chdigit[] = { + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, + 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 255, 255, 255, 255, 255, 255, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, + 33, 34, 35, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255}; + + static constexpr size_t maxdigits_u64[] = { + 64, 41, 32, 28, 25, 23, 22, 21, 20, 19, 18, 18, 17, 17, 16, 16, 16, 16, + 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13}; + + static constexpr uint64_t min_safe_u64[] = { + 9223372036854775808ull, 12157665459056928801ull, 4611686018427387904, + 7450580596923828125, 4738381338321616896, 3909821048582988049, + 9223372036854775808ull, 12157665459056928801ull, 10000000000000000000ull, + 5559917313492231481, 2218611106740436992, 8650415919381337933, + 2177953337809371136, 6568408355712890625, 1152921504606846976, + 2862423051509815793, 6746640616477458432, 15181127029874798299ull, + 1638400000000000000, 3243919932521508681, 6221821273427820544, + 11592836324538749809ull, 876488338465357824, 1490116119384765625, + 2481152873203736576, 4052555153018976267, 6502111422497947648, + 10260628712958602189ull, 15943230000000000000ull, 787662783788549761, + 1152921504606846976, 1667889514952984961, 2386420683693101056, + 3379220508056640625, 4738381338321616896}; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint8_t int_luts::chdigit[]; + +template constexpr size_t int_luts::maxdigits_u64[]; + +template constexpr uint64_t int_luts::min_safe_u64[]; + +#endif + +template +fastfloat_really_inline constexpr uint8_t ch_to_digit(UC c) { + return int_luts<>::chdigit[static_cast(c)]; +} + +fastfloat_really_inline constexpr size_t max_digits_u64(int base) { + return int_luts<>::maxdigits_u64[base - 2]; +} + +// If a u64 is exactly max_digits_u64() in length, this is +// the value below which it has definitely overflowed. 
+fastfloat_really_inline constexpr uint64_t min_safe_u64(int base) { + return int_luts<>::min_safe_u64[base - 2]; +} + +} // namespace fast_float + +#endif + + +#ifndef FASTFLOAT_FAST_FLOAT_H +#define FASTFLOAT_FAST_FLOAT_H + + +namespace fast_float { +/** + * This function parses the character sequence [first,last) for a number. It + * parses floating-point numbers expecting a locale-indepent format equivalent + * to what is used by std::strtod in the default ("C") locale. The resulting + * floating-point value is the closest floating-point values (using either float + * or double), using the "round to even" convention for values that would + * otherwise fall right in-between two values. That is, we provide exact parsing + * according to the IEEE standard. + * + * Given a successful parse, the pointer (`ptr`) in the returned value is set to + * point right after the parsed number, and the `value` referenced is set to the + * parsed value. In case of error, the returned `ec` contains a representative + * error, otherwise the default (`std::errc()`) value is stored. + * + * The implementation does not throw and does not allocate memory (e.g., with + * `new` or `malloc`). + * + * Like the C++17 standard, the `fast_float::from_chars` functions take an + * optional last argument of the type `fast_float::chars_format`. It is a bitset + * value: we check whether `fmt & fast_float::chars_format::fixed` and `fmt & + * fast_float::chars_format::scientific` are set to determine whether we allow + * the fixed point and scientific notation respectively. The default is + * `fast_float::chars_format::general` which allows both `fixed` and + * `scientific`. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + chars_format fmt = chars_format::general) noexcept; + +/** + * Like from_chars, but accepts an `options` argument to govern number parsing. 
+ */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept; +/** + * from_chars for integer types. + */ +template ())> +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base = 10) noexcept; + +} // namespace fast_float +#endif // FASTFLOAT_FAST_FLOAT_H + +#ifndef FASTFLOAT_ASCII_NUMBER_H +#define FASTFLOAT_ASCII_NUMBER_H + +#include +#include +#include +#include +#include +#include + + +#ifdef FASTFLOAT_SSE2 +#include +#endif + +#ifdef FASTFLOAT_NEON +#include +#endif + +namespace fast_float { + +template fastfloat_really_inline constexpr bool has_simd_opt() { +#ifdef FASTFLOAT_HAS_SIMD + return std::is_same::value; +#else + return false; +#endif +} + +// Next function can be micro-optimized, but compilers are entirely +// able to optimize it well. +template +fastfloat_really_inline constexpr bool is_integer(UC c) noexcept { + return !(c > UC('9') || c < UC('0')); +} + +fastfloat_really_inline constexpr uint64_t byteswap(uint64_t val) { + return (val & 0xFF00000000000000) >> 56 | (val & 0x00FF000000000000) >> 40 | + (val & 0x0000FF0000000000) >> 24 | (val & 0x000000FF00000000) >> 8 | + (val & 0x00000000FF000000) << 8 | (val & 0x0000000000FF0000) << 24 | + (val & 0x000000000000FF00) << 40 | (val & 0x00000000000000FF) << 56; +} + +// Read 8 UC into a u64. Truncates UC if not char. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +read8_to_u64(const UC *chars) { + if (cpp20_and_in_constexpr() || !std::is_same::value) { + uint64_t val = 0; + for (int i = 0; i < 8; ++i) { + val |= uint64_t(uint8_t(*chars)) << (i * 8); + ++chars; + } + return val; + } + uint64_t val; + ::memcpy(&val, chars, sizeof(uint64_t)); +#if FASTFLOAT_IS_BIG_ENDIAN == 1 + // Need to read as-if the number was in little-endian order. 
+ val = byteswap(val); +#endif + return val; +} + +#ifdef FASTFLOAT_SSE2 + +fastfloat_really_inline uint64_t simd_read8_to_u64(const __m128i data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i packed = _mm_packus_epi16(data, data); +#ifdef FASTFLOAT_64BIT + return uint64_t(_mm_cvtsi128_si64(packed)); +#else + uint64_t value; + // Visual Studio + older versions of GCC don't support _mm_storeu_si64 + _mm_storel_epi64(reinterpret_cast<__m128i *>(&value), packed); + return value; +#endif + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + _mm_loadu_si128(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +#elif defined(FASTFLOAT_NEON) + +fastfloat_really_inline uint64_t simd_read8_to_u64(const uint16x8_t data) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + uint8x8_t utf8_packed = vmovn_u16(data); + return vget_lane_u64(vreinterpret_u64_u8(utf8_packed), 0); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +fastfloat_really_inline uint64_t simd_read8_to_u64(const char16_t *chars) { + FASTFLOAT_SIMD_DISABLE_WARNINGS + return simd_read8_to_u64( + vld1q_u16(reinterpret_cast(chars))); + FASTFLOAT_SIMD_RESTORE_WARNINGS +} + +#endif // FASTFLOAT_SSE2 + +// MSVC SFINAE is broken pre-VS2017 +#if defined(_MSC_VER) && _MSC_VER <= 1900 +template +#else +template ()) = 0> +#endif +// dummy for compile +uint64_t simd_read8_to_u64(UC const *) { + return 0; +} + +// credit @aqrit +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint32_t +parse_eight_digits_unrolled(uint64_t val) { + const uint64_t mask = 0x000000FF000000FF; + const uint64_t mul1 = 0x000F424000000064; // 100 + (1000000ULL << 32) + const uint64_t mul2 = 0x0000271000000001; // 1 + (10000ULL << 32) + val -= 0x3030303030303030; + val = (val * 10) + (val >> 8); // val = (val * 2561) >> 8; + val = (((val & mask) * mul1) + (((val >> 16) & mask) * mul2)) >> 32; + return uint32_t(val); +} + +// Call 
this if chars are definitely 8 digits. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint32_t +parse_eight_digits_unrolled(UC const *chars) noexcept { + if (cpp20_and_in_constexpr() || !has_simd_opt()) { + return parse_eight_digits_unrolled(read8_to_u64(chars)); // truncation okay + } + return parse_eight_digits_unrolled(simd_read8_to_u64(chars)); +} + +// credit @aqrit +fastfloat_really_inline constexpr bool +is_made_of_eight_digits_fast(uint64_t val) noexcept { + return !((((val + 0x4646464646464646) | (val - 0x3030303030303030)) & + 0x8080808080808080)); +} + +#ifdef FASTFLOAT_HAS_SIMD + +// Call this if chars might not be 8 digits. +// Using this style (instead of is_made_of_eight_digits_fast() then +// parse_eight_digits_unrolled()) ensures we don't load SIMD registers twice. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +simd_parse_if_eight_digits_unrolled(const char16_t *chars, + uint64_t &i) noexcept { + if (cpp20_and_in_constexpr()) { + return false; + } +#ifdef FASTFLOAT_SSE2 + FASTFLOAT_SIMD_DISABLE_WARNINGS + const __m128i data = + _mm_loadu_si128(reinterpret_cast(chars)); + + // (x - '0') <= 9 + // http://0x80.pl/articles/simd-parsing-int-sequences.html + const __m128i t0 = _mm_add_epi16(data, _mm_set1_epi16(32720)); + const __m128i t1 = _mm_cmpgt_epi16(t0, _mm_set1_epi16(-32759)); + + if (_mm_movemask_epi8(t1) == 0) { + i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); + return true; + } else + return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS +#elif defined(FASTFLOAT_NEON) + FASTFLOAT_SIMD_DISABLE_WARNINGS + const uint16x8_t data = vld1q_u16(reinterpret_cast(chars)); + + // (x - '0') <= 9 + // http://0x80.pl/articles/simd-parsing-int-sequences.html + const uint16x8_t t0 = vsubq_u16(data, vmovq_n_u16('0')); + const uint16x8_t mask = vcltq_u16(t0, vmovq_n_u16('9' - '0' + 1)); + + if (vminvq_u16(mask) == 0xFFFF) { + i = i * 100000000 + parse_eight_digits_unrolled(simd_read8_to_u64(data)); + return true; + } else + 
return false; + FASTFLOAT_SIMD_RESTORE_WARNINGS +#else + (void)chars; + (void)i; + return false; +#endif // FASTFLOAT_SSE2 +} + +#endif // FASTFLOAT_HAS_SIMD + +// MSVC SFINAE is broken pre-VS2017 +#if defined(_MSC_VER) && _MSC_VER <= 1900 +template +#else +template ()) = 0> +#endif +// dummy for compile +bool simd_parse_if_eight_digits_unrolled(UC const *, uint64_t &) { + return 0; +} + +template ::value) = 0> +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const UC *&p, const UC *const pend, uint64_t &i) { + if (!has_simd_opt()) { + return; + } + while ((std::distance(p, pend) >= 8) && + simd_parse_if_eight_digits_unrolled( + p, i)) { // in rare cases, this will overflow, but that's ok + p += 8; + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +loop_parse_if_eight_digits(const char *&p, const char *const pend, + uint64_t &i) { + // optimizes better than parse_if_eight_digits_unrolled() for UC = char. + while ((std::distance(p, pend) >= 8) && + is_made_of_eight_digits_fast(read8_to_u64(p))) { + i = i * 100000000 + + parse_eight_digits_unrolled(read8_to_u64( + p)); // in rare cases, this will overflow, but that's ok + p += 8; + } +} + +enum class parse_error { + no_error, + // [JSON-only] The minus sign must be followed by an integer. + missing_integer_after_sign, + // A sign must be followed by an integer or dot. + missing_integer_or_dot_after_sign, + // [JSON-only] The integer part must not have leading zeros. + leading_zeros_in_integer_part, + // [JSON-only] The integer part must have at least one digit. + no_digits_in_integer_part, + // [JSON-only] If there is a decimal point, there must be digits in the + // fractional part. + no_digits_in_fractional_part, + // The mantissa must have at least one digit. + no_digits_in_mantissa, + // Scientific notation requires an exponential part. 
+ missing_exponential_part, +}; + +template struct parsed_number_string_t { + int64_t exponent{0}; + uint64_t mantissa{0}; + UC const *lastmatch{nullptr}; + bool negative{false}; + bool valid{false}; + bool too_many_digits{false}; + // contains the range of the significant digits + span integer{}; // non-nullable + span fraction{}; // nullable + parse_error error{parse_error::no_error}; +}; + +using byte_span = span; +using parsed_number_string = parsed_number_string_t; + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +report_parse_error(UC const *p, parse_error error) { + parsed_number_string_t answer; + answer.valid = false; + answer.lastmatch = p; + answer.error = error; + return answer; +} + +// Assuming that you use no more than 19 digits, this will +// parse an ASCII string. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 parsed_number_string_t +parse_number_string(UC const *p, UC const *pend, + parse_options_t options) noexcept { + chars_format const fmt = options.format; + UC const decimal_point = options.decimal_point; + + parsed_number_string_t answer; + answer.valid = false; + answer.too_many_digits = false; + answer.negative = (*p == UC('-')); +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == UC('-')) || (!(fmt & FASTFLOAT_JSONFMT) && *p == UC('+'))) { +#else + if (*p == UC('-')) { // C++17 20.19.3.(7.1) explicitly forbids '+' sign here +#endif + ++p; + if (p == pend) { + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); + } + if (fmt & FASTFLOAT_JSONFMT) { + if (!is_integer(*p)) { // a sign must be followed by an integer + return report_parse_error(p, + parse_error::missing_integer_after_sign); + } + } else { + if (!is_integer(*p) && + (*p != + decimal_point)) { // a sign must be followed by an integer or the dot + return report_parse_error( + p, parse_error::missing_integer_or_dot_after_sign); + } + } + } + UC const *const start_digits = p; + + uint64_t i 
= 0; // an unsigned int avoids signed overflows (which are bad) + + while ((p != pend) && is_integer(*p)) { + // a multiplication by 10 is cheaper than an arbitrary integer + // multiplication + i = 10 * i + + uint64_t(*p - + UC('0')); // might overflow, we will handle the overflow later + ++p; + } + UC const *const end_of_integer_part = p; + int64_t digit_count = int64_t(end_of_integer_part - start_digits); + answer.integer = span(start_digits, size_t(digit_count)); + if (fmt & FASTFLOAT_JSONFMT) { + // at least 1 digit in integer part, without leading zeros + if (digit_count == 0) { + return report_parse_error(p, parse_error::no_digits_in_integer_part); + } + if ((start_digits[0] == UC('0') && digit_count > 1)) { + return report_parse_error(start_digits, + parse_error::leading_zeros_in_integer_part); + } + } + + int64_t exponent = 0; + const bool has_decimal_point = (p != pend) && (*p == decimal_point); + if (has_decimal_point) { + ++p; + UC const *before = p; + // can occur at most twice without overflowing, but let it occur more, since + // for integers with many digits, digit parsing is the primary bottleneck. + loop_parse_if_eight_digits(p, pend, i); + + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - UC('0')); + ++p; + i = i * 10 + digit; // in rare cases, this will overflow, but that's ok + } + exponent = before - p; + answer.fraction = span(before, size_t(p - before)); + digit_count -= exponent; + } + if (fmt & FASTFLOAT_JSONFMT) { + // at least 1 digit in fractional part + if (has_decimal_point && exponent == 0) { + return report_parse_error(p, + parse_error::no_digits_in_fractional_part); + } + } else if (digit_count == + 0) { // we must have encountered at least one integer! 
+ return report_parse_error(p, parse_error::no_digits_in_mantissa); + } + int64_t exp_number = 0; // explicit exponential part + if (((fmt & chars_format::scientific) && (p != pend) && + ((UC('e') == *p) || (UC('E') == *p))) || + ((fmt & FASTFLOAT_FORTRANFMT) && (p != pend) && + ((UC('+') == *p) || (UC('-') == *p) || (UC('d') == *p) || + (UC('D') == *p)))) { + UC const *location_of_e = p; + if ((UC('e') == *p) || (UC('E') == *p) || (UC('d') == *p) || + (UC('D') == *p)) { + ++p; + } + bool neg_exp = false; + if ((p != pend) && (UC('-') == *p)) { + neg_exp = true; + ++p; + } else if ((p != pend) && + (UC('+') == + *p)) { // '+' on exponent is allowed by C++17 20.19.3.(7.1) + ++p; + } + if ((p == pend) || !is_integer(*p)) { + if (!(fmt & chars_format::fixed)) { + // The exponential part is invalid for scientific notation, so it must + // be a trailing token for fixed notation. However, fixed notation is + // disabled, so report a scientific notation error. + return report_parse_error(p, parse_error::missing_exponential_part); + } + // Otherwise, we will be ignoring the 'e'. + p = location_of_e; + } else { + while ((p != pend) && is_integer(*p)) { + uint8_t digit = uint8_t(*p - UC('0')); + if (exp_number < 0x10000000) { + exp_number = 10 * exp_number + digit; + } + ++p; + } + if (neg_exp) { + exp_number = -exp_number; + } + exponent += exp_number; + } + } else { + // If it scientific and not fixed, we have to bail out. + if ((fmt & chars_format::scientific) && !(fmt & chars_format::fixed)) { + return report_parse_error(p, parse_error::missing_exponential_part); + } + } + answer.lastmatch = p; + answer.valid = true; + + // If we frequently had to deal with long strings of digits, + // we could extend our code by using a 128-bit integer instead + // of a 64-bit integer. However, this is uncommon. + // + // We can deal with up to 19 digits. + if (digit_count > 19) { // this is uncommon + // It is possible that the integer had an overflow. 
+ // We have to handle the case where we have 0.0000somenumber. + // We need to be mindful of the case where we only have zeroes... + // E.g., 0.000000000...000. + UC const *start = start_digits; + while ((start != pend) && (*start == UC('0') || *start == decimal_point)) { + if (*start == UC('0')) { + digit_count--; + } + start++; + } + + if (digit_count > 19) { + answer.too_many_digits = true; + // Let us start again, this time, avoiding overflows. + // We don't need to check if is_integer, since we use the + // pre-tokenized spans from above. + i = 0; + p = answer.integer.ptr; + UC const *int_end = p + answer.integer.len(); + const uint64_t minimal_nineteen_digit_integer{1000000000000000000}; + while ((i < minimal_nineteen_digit_integer) && (p != int_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + if (i >= minimal_nineteen_digit_integer) { // We have a big integers + exponent = end_of_integer_part - p + exp_number; + } else { // We have a value with a fractional component. 
+ p = answer.fraction.ptr; + UC const *frac_end = p + answer.fraction.len(); + while ((i < minimal_nineteen_digit_integer) && (p != frac_end)) { + i = i * 10 + uint64_t(*p - UC('0')); + ++p; + } + exponent = answer.fraction.ptr - p + exp_number; + } + // We have now corrected both exponent and i, to a truncated value + } + } + answer.exponent = exponent; + answer.mantissa = i; + return answer; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 from_chars_result_t +parse_int_string(UC const *p, UC const *pend, T &value, int base) { + from_chars_result_t answer; + + UC const *const first = p; + + bool negative = (*p == UC('-')); + if (!std::is_signed::value && negative) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if ((*p == UC('-')) || (*p == UC('+'))) { +#else + if (*p == UC('-')) { +#endif + ++p; + } + + UC const *const start_num = p; + + while (p != pend && *p == UC('0')) { + ++p; + } + + const bool has_leading_zeros = p > start_num; + + UC const *const start_digits = p; + + uint64_t i = 0; + if (base == 10) { + loop_parse_if_eight_digits(p, pend, i); // use SIMD if possible + } + while (p != pend) { + uint8_t digit = ch_to_digit(*p); + if (digit >= base) { + break; + } + i = uint64_t(base) * i + digit; // might overflow, check this later + p++; + } + + size_t digit_count = size_t(p - start_digits); + + if (digit_count == 0) { + if (has_leading_zeros) { + value = 0; + answer.ec = std::errc(); + answer.ptr = p; + } else { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + } + return answer; + } + + answer.ptr = p; + + // check u64 overflow + size_t max_digits = max_digits_u64(base); + if (digit_count > max_digits) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + // this check can be eliminated for all other types, but they will all require + // a max_digits(base) equivalent + if (digit_count == max_digits && i < 
min_safe_u64(base)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + + // check other types overflow + if (!std::is_same::value) { + if (i > uint64_t(std::numeric_limits::max()) + uint64_t(negative)) { + answer.ec = std::errc::result_out_of_range; + return answer; + } + } + + if (negative) { +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +#pragma warning(disable : 4146) +#endif + // this weird workaround is required because: + // - converting unsigned to signed when its value is greater than signed max + // is UB pre-C++23. + // - reinterpret_casting (~i + 1) would work, but it is not constexpr + // this is always optimized into a neg instruction (note: T is an integer + // type) + value = T(-std::numeric_limits::max() - + T(i - uint64_t(std::numeric_limits::max()))); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#endif + } else { + value = T(i); + } + + answer.ec = std::errc(); + return answer; +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_FAST_TABLE_H +#define FASTFLOAT_FAST_TABLE_H + +#include + +namespace fast_float { + +/** + * When mapping numbers from decimal to binary, + * we go from w * 10^q to m * 2^p but we have + * 10^q = 5^q * 2^q, so effectively + * we are trying to match + * w * 2^q * 5^q to m * 2^p. Thus the powers of two + * are not a concern since they can be represented + * exactly using the binary notation, only the powers of five + * affect the binary significand. + */ + +/** + * The smallest non-zero float (binary64) is 2^-1074. + * We take as input numbers of the form w x 10^q where w < 2^64. + * We have that w * 10^-343 < 2^(64-344) 5^-343 < 2^-1076. + * However, we have that + * (2^64-1) * 10^-342 = (2^64-1) * 2^-342 * 5^-342 > 2^-1074. + * Thus it is possible for a number of the form w * 10^-342 where + * w is a 64-bit value to be a non-zero floating-point number. 
+ ********* + * Any number of form w * 10^309 where w>= 1 is going to be + * infinite in binary64 so we never need to worry about powers + * of 5 greater than 308. + */ +template struct powers_template { + + constexpr static int smallest_power_of_five = + binary_format::smallest_power_of_ten(); + constexpr static int largest_power_of_five = + binary_format::largest_power_of_ten(); + constexpr static int number_of_entries = + 2 * (largest_power_of_five - smallest_power_of_five + 1); + // Powers of five from 5^-342 all the way to 5^308 rounded toward one. + constexpr static uint64_t power_of_five_128[number_of_entries] = { + 0xeef453d6923bd65a, 0x113faa2906a13b3f, + 0x9558b4661b6565f8, 0x4ac7ca59a424c507, + 0xbaaee17fa23ebf76, 0x5d79bcf00d2df649, + 0xe95a99df8ace6f53, 0xf4d82c2c107973dc, + 0x91d8a02bb6c10594, 0x79071b9b8a4be869, + 0xb64ec836a47146f9, 0x9748e2826cdee284, + 0xe3e27a444d8d98b7, 0xfd1b1b2308169b25, + 0x8e6d8c6ab0787f72, 0xfe30f0f5e50e20f7, + 0xb208ef855c969f4f, 0xbdbd2d335e51a935, + 0xde8b2b66b3bc4723, 0xad2c788035e61382, + 0x8b16fb203055ac76, 0x4c3bcb5021afcc31, + 0xaddcb9e83c6b1793, 0xdf4abe242a1bbf3d, + 0xd953e8624b85dd78, 0xd71d6dad34a2af0d, + 0x87d4713d6f33aa6b, 0x8672648c40e5ad68, + 0xa9c98d8ccb009506, 0x680efdaf511f18c2, + 0xd43bf0effdc0ba48, 0x212bd1b2566def2, + 0x84a57695fe98746d, 0x14bb630f7604b57, + 0xa5ced43b7e3e9188, 0x419ea3bd35385e2d, + 0xcf42894a5dce35ea, 0x52064cac828675b9, + 0x818995ce7aa0e1b2, 0x7343efebd1940993, + 0xa1ebfb4219491a1f, 0x1014ebe6c5f90bf8, + 0xca66fa129f9b60a6, 0xd41a26e077774ef6, + 0xfd00b897478238d0, 0x8920b098955522b4, + 0x9e20735e8cb16382, 0x55b46e5f5d5535b0, + 0xc5a890362fddbc62, 0xeb2189f734aa831d, + 0xf712b443bbd52b7b, 0xa5e9ec7501d523e4, + 0x9a6bb0aa55653b2d, 0x47b233c92125366e, + 0xc1069cd4eabe89f8, 0x999ec0bb696e840a, + 0xf148440a256e2c76, 0xc00670ea43ca250d, + 0x96cd2a865764dbca, 0x380406926a5e5728, + 0xbc807527ed3e12bc, 0xc605083704f5ecf2, + 0xeba09271e88d976b, 0xf7864a44c633682e, + 0x93445b8731587ea3, 
0x7ab3ee6afbe0211d, + 0xb8157268fdae9e4c, 0x5960ea05bad82964, + 0xe61acf033d1a45df, 0x6fb92487298e33bd, + 0x8fd0c16206306bab, 0xa5d3b6d479f8e056, + 0xb3c4f1ba87bc8696, 0x8f48a4899877186c, + 0xe0b62e2929aba83c, 0x331acdabfe94de87, + 0x8c71dcd9ba0b4925, 0x9ff0c08b7f1d0b14, + 0xaf8e5410288e1b6f, 0x7ecf0ae5ee44dd9, + 0xdb71e91432b1a24a, 0xc9e82cd9f69d6150, + 0x892731ac9faf056e, 0xbe311c083a225cd2, + 0xab70fe17c79ac6ca, 0x6dbd630a48aaf406, + 0xd64d3d9db981787d, 0x92cbbccdad5b108, + 0x85f0468293f0eb4e, 0x25bbf56008c58ea5, + 0xa76c582338ed2621, 0xaf2af2b80af6f24e, + 0xd1476e2c07286faa, 0x1af5af660db4aee1, + 0x82cca4db847945ca, 0x50d98d9fc890ed4d, + 0xa37fce126597973c, 0xe50ff107bab528a0, + 0xcc5fc196fefd7d0c, 0x1e53ed49a96272c8, + 0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7a, + 0x9faacf3df73609b1, 0x77b191618c54e9ac, + 0xc795830d75038c1d, 0xd59df5b9ef6a2417, + 0xf97ae3d0d2446f25, 0x4b0573286b44ad1d, + 0x9becce62836ac577, 0x4ee367f9430aec32, + 0xc2e801fb244576d5, 0x229c41f793cda73f, + 0xf3a20279ed56d48a, 0x6b43527578c1110f, + 0x9845418c345644d6, 0x830a13896b78aaa9, + 0xbe5691ef416bd60c, 0x23cc986bc656d553, + 0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa8, + 0x94b3a202eb1c3f39, 0x7bf7d71432f3d6a9, + 0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc53, + 0xe858ad248f5c22c9, 0xd1b3400f8f9cff68, + 0x91376c36d99995be, 0x23100809b9c21fa1, + 0xb58547448ffffb2d, 0xabd40a0c2832a78a, + 0xe2e69915b3fff9f9, 0x16c90c8f323f516c, + 0x8dd01fad907ffc3b, 0xae3da7d97f6792e3, + 0xb1442798f49ffb4a, 0x99cd11cfdf41779c, + 0xdd95317f31c7fa1d, 0x40405643d711d583, + 0x8a7d3eef7f1cfc52, 0x482835ea666b2572, + 0xad1c8eab5ee43b66, 0xda3243650005eecf, + 0xd863b256369d4a40, 0x90bed43e40076a82, + 0x873e4f75e2224e68, 0x5a7744a6e804a291, + 0xa90de3535aaae202, 0x711515d0a205cb36, + 0xd3515c2831559a83, 0xd5a5b44ca873e03, + 0x8412d9991ed58091, 0xe858790afe9486c2, + 0xa5178fff668ae0b6, 0x626e974dbe39a872, + 0xce5d73ff402d98e3, 0xfb0a3d212dc8128f, + 0x80fa687f881c7f8e, 0x7ce66634bc9d0b99, + 0xa139029f6a239f72, 0x1c1fffc1ebc44e80, + 
0xc987434744ac874e, 0xa327ffb266b56220, + 0xfbe9141915d7a922, 0x4bf1ff9f0062baa8, + 0x9d71ac8fada6c9b5, 0x6f773fc3603db4a9, + 0xc4ce17b399107c22, 0xcb550fb4384d21d3, + 0xf6019da07f549b2b, 0x7e2a53a146606a48, + 0x99c102844f94e0fb, 0x2eda7444cbfc426d, + 0xc0314325637a1939, 0xfa911155fefb5308, + 0xf03d93eebc589f88, 0x793555ab7eba27ca, + 0x96267c7535b763b5, 0x4bc1558b2f3458de, + 0xbbb01b9283253ca2, 0x9eb1aaedfb016f16, + 0xea9c227723ee8bcb, 0x465e15a979c1cadc, + 0x92a1958a7675175f, 0xbfacd89ec191ec9, + 0xb749faed14125d36, 0xcef980ec671f667b, + 0xe51c79a85916f484, 0x82b7e12780e7401a, + 0x8f31cc0937ae58d2, 0xd1b2ecb8b0908810, + 0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa15, + 0xdfbdcece67006ac9, 0x67a791e093e1d49a, + 0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e0, + 0xaecc49914078536d, 0x58fae9f773886e18, + 0xda7f5bf590966848, 0xaf39a475506a899e, + 0x888f99797a5e012d, 0x6d8406c952429603, + 0xaab37fd7d8f58178, 0xc8e5087ba6d33b83, + 0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a64, + 0x855c3be0a17fcd26, 0x5cf2eea09a55067f, + 0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481e, + 0xd0601d8efc57b08b, 0xf13b94daf124da26, + 0x823c12795db6ce57, 0x76c53d08d6b70858, + 0xa2cb1717b52481ed, 0x54768c4b0c64ca6e, + 0xcb7ddcdda26da268, 0xa9942f5dcf7dfd09, + 0xfe5d54150b090b02, 0xd3f93b35435d7c4c, + 0x9efa548d26e5a6e1, 0xc47bc5014a1a6daf, + 0xc6b8e9b0709f109a, 0x359ab6419ca1091b, + 0xf867241c8cc6d4c0, 0xc30163d203c94b62, + 0x9b407691d7fc44f8, 0x79e0de63425dcf1d, + 0xc21094364dfb5636, 0x985915fc12f542e4, + 0xf294b943e17a2bc4, 0x3e6f5b7b17b2939d, + 0x979cf3ca6cec5b5a, 0xa705992ceecf9c42, + 0xbd8430bd08277231, 0x50c6ff782a838353, + 0xece53cec4a314ebd, 0xa4f8bf5635246428, + 0x940f4613ae5ed136, 0x871b7795e136be99, + 0xb913179899f68584, 0x28e2557b59846e3f, + 0xe757dd7ec07426e5, 0x331aeada2fe589cf, + 0x9096ea6f3848984f, 0x3ff0d2c85def7621, + 0xb4bca50b065abe63, 0xfed077a756b53a9, + 0xe1ebce4dc7f16dfb, 0xd3e8495912c62894, + 0x8d3360f09cf6e4bd, 0x64712dd7abbbd95c, + 0xb080392cc4349dec, 0xbd8d794d96aacfb3, + 0xdca04777f541c567, 
0xecf0d7a0fc5583a0, + 0x89e42caaf9491b60, 0xf41686c49db57244, + 0xac5d37d5b79b6239, 0x311c2875c522ced5, + 0xd77485cb25823ac7, 0x7d633293366b828b, + 0x86a8d39ef77164bc, 0xae5dff9c02033197, + 0xa8530886b54dbdeb, 0xd9f57f830283fdfc, + 0xd267caa862a12d66, 0xd072df63c324fd7b, + 0x8380dea93da4bc60, 0x4247cb9e59f71e6d, + 0xa46116538d0deb78, 0x52d9be85f074e608, + 0xcd795be870516656, 0x67902e276c921f8b, + 0x806bd9714632dff6, 0xba1cd8a3db53b6, + 0xa086cfcd97bf97f3, 0x80e8a40eccd228a4, + 0xc8a883c0fdaf7df0, 0x6122cd128006b2cd, + 0xfad2a4b13d1b5d6c, 0x796b805720085f81, + 0x9cc3a6eec6311a63, 0xcbe3303674053bb0, + 0xc3f490aa77bd60fc, 0xbedbfc4411068a9c, + 0xf4f1b4d515acb93b, 0xee92fb5515482d44, + 0x991711052d8bf3c5, 0x751bdd152d4d1c4a, + 0xbf5cd54678eef0b6, 0xd262d45a78a0635d, + 0xef340a98172aace4, 0x86fb897116c87c34, + 0x9580869f0e7aac0e, 0xd45d35e6ae3d4da0, + 0xbae0a846d2195712, 0x8974836059cca109, + 0xe998d258869facd7, 0x2bd1a438703fc94b, + 0x91ff83775423cc06, 0x7b6306a34627ddcf, + 0xb67f6455292cbf08, 0x1a3bc84c17b1d542, + 0xe41f3d6a7377eeca, 0x20caba5f1d9e4a93, + 0x8e938662882af53e, 0x547eb47b7282ee9c, + 0xb23867fb2a35b28d, 0xe99e619a4f23aa43, + 0xdec681f9f4c31f31, 0x6405fa00e2ec94d4, + 0x8b3c113c38f9f37e, 0xde83bc408dd3dd04, + 0xae0b158b4738705e, 0x9624ab50b148d445, + 0xd98ddaee19068c76, 0x3badd624dd9b0957, + 0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d6, + 0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4c, + 0xd47487cc8470652b, 0x7647c3200069671f, + 0x84c8d4dfd2c63f3b, 0x29ecd9f40041e073, + 0xa5fb0a17c777cf09, 0xf468107100525890, + 0xcf79cc9db955c2cc, 0x7182148d4066eeb4, + 0x81ac1fe293d599bf, 0xc6f14cd848405530, + 0xa21727db38cb002f, 0xb8ada00e5a506a7c, + 0xca9cf1d206fdc03b, 0xa6d90811f0e4851c, + 0xfd442e4688bd304a, 0x908f4a166d1da663, + 0x9e4a9cec15763e2e, 0x9a598e4e043287fe, + 0xc5dd44271ad3cdba, 0x40eff1e1853f29fd, + 0xf7549530e188c128, 0xd12bee59e68ef47c, + 0x9a94dd3e8cf578b9, 0x82bb74f8301958ce, + 0xc13a148e3032d6e7, 0xe36a52363c1faf01, + 0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac1, + 
0x96f5600f15a7b7e5, 0x29ab103a5ef8c0b9, + 0xbcb2b812db11a5de, 0x7415d448f6b6f0e7, + 0xebdf661791d60f56, 0x111b495b3464ad21, + 0x936b9fcebb25c995, 0xcab10dd900beec34, + 0xb84687c269ef3bfb, 0x3d5d514f40eea742, + 0xe65829b3046b0afa, 0xcb4a5a3112a5112, + 0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ab, + 0xb3f4e093db73a093, 0x59ed216765690f56, + 0xe0f218b8d25088b8, 0x306869c13ec3532c, + 0x8c974f7383725573, 0x1e414218c73a13fb, + 0xafbd2350644eeacf, 0xe5d1929ef90898fa, + 0xdbac6c247d62a583, 0xdf45f746b74abf39, + 0x894bc396ce5da772, 0x6b8bba8c328eb783, + 0xab9eb47c81f5114f, 0x66ea92f3f326564, + 0xd686619ba27255a2, 0xc80a537b0efefebd, + 0x8613fd0145877585, 0xbd06742ce95f5f36, + 0xa798fc4196e952e7, 0x2c48113823b73704, + 0xd17f3b51fca3a7a0, 0xf75a15862ca504c5, + 0x82ef85133de648c4, 0x9a984d73dbe722fb, + 0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebba, + 0xcc963fee10b7d1b3, 0x318df905079926a8, + 0xffbbcfe994e5c61f, 0xfdf17746497f7052, + 0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa633, + 0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc0, + 0xf9bd690a1b68637b, 0x3dfdce7aa3c673b0, + 0x9c1661a651213e2d, 0x6bea10ca65c084e, + 0xc31bfa0fe5698db8, 0x486e494fcff30a62, + 0xf3e2f893dec3f126, 0x5a89dba3c3efccfa, + 0x986ddb5c6b3a76b7, 0xf89629465a75e01c, + 0xbe89523386091465, 0xf6bbb397f1135823, + 0xee2ba6c0678b597f, 0x746aa07ded582e2c, + 0x94db483840b717ef, 0xa8c2a44eb4571cdc, + 0xba121a4650e4ddeb, 0x92f34d62616ce413, + 0xe896a0d7e51e1566, 0x77b020baf9c81d17, + 0x915e2486ef32cd60, 0xace1474dc1d122e, + 0xb5b5ada8aaff80b8, 0xd819992132456ba, + 0xe3231912d5bf60e6, 0x10e1fff697ed6c69, + 0x8df5efabc5979c8f, 0xca8d3ffa1ef463c1, + 0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb2, + 0xddd0467c64bce4a0, 0xac7cb3f6d05ddbde, + 0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96b, + 0xad4ab7112eb3929d, 0x86c16c98d2c953c6, + 0xd89d64d57a607744, 0xe871c7bf077ba8b7, + 0x87625f056c7c4a8b, 0x11471cd764ad4972, + 0xa93af6c6c79b5d2d, 0xd598e40d3dd89bcf, + 0xd389b47879823479, 0x4aff1d108d4ec2c3, + 0x843610cb4bf160cb, 0xcedf722a585139ba, + 0xa54394fe1eedb8fe, 
0xc2974eb4ee658828, + 0xce947a3da6a9273e, 0x733d226229feea32, + 0x811ccc668829b887, 0x806357d5a3f525f, + 0xa163ff802a3426a8, 0xca07c2dcb0cf26f7, + 0xc9bcff6034c13052, 0xfc89b393dd02f0b5, + 0xfc2c3f3841f17c67, 0xbbac2078d443ace2, + 0x9d9ba7832936edc0, 0xd54b944b84aa4c0d, + 0xc5029163f384a931, 0xa9e795e65d4df11, + 0xf64335bcf065d37d, 0x4d4617b5ff4a16d5, + 0x99ea0196163fa42e, 0x504bced1bf8e4e45, + 0xc06481fb9bcf8d39, 0xe45ec2862f71e1d6, + 0xf07da27a82c37088, 0x5d767327bb4e5a4c, + 0x964e858c91ba2655, 0x3a6a07f8d510f86f, + 0xbbe226efb628afea, 0x890489f70a55368b, + 0xeadab0aba3b2dbe5, 0x2b45ac74ccea842e, + 0x92c8ae6b464fc96f, 0x3b0b8bc90012929d, + 0xb77ada0617e3bbcb, 0x9ce6ebb40173744, + 0xe55990879ddcaabd, 0xcc420a6a101d0515, + 0x8f57fa54c2a9eab6, 0x9fa946824a12232d, + 0xb32df8e9f3546564, 0x47939822dc96abf9, + 0xdff9772470297ebd, 0x59787e2b93bc56f7, + 0x8bfbea76c619ef36, 0x57eb4edb3c55b65a, + 0xaefae51477a06b03, 0xede622920b6b23f1, + 0xdab99e59958885c4, 0xe95fab368e45eced, + 0x88b402f7fd75539b, 0x11dbcb0218ebb414, + 0xaae103b5fcd2a881, 0xd652bdc29f26a119, + 0xd59944a37c0752a2, 0x4be76d3346f0495f, + 0x857fcae62d8493a5, 0x6f70a4400c562ddb, + 0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb952, + 0xd097ad07a71f26b2, 0x7e2000a41346a7a7, + 0x825ecc24c873782f, 0x8ed400668c0c28c8, + 0xa2f67f2dfa90563b, 0x728900802f0f32fa, + 0xcbb41ef979346bca, 0x4f2b40a03ad2ffb9, + 0xfea126b7d78186bc, 0xe2f610c84987bfa8, + 0x9f24b832e6b0f436, 0xdd9ca7d2df4d7c9, + 0xc6ede63fa05d3143, 0x91503d1c79720dbb, + 0xf8a95fcf88747d94, 0x75a44c6397ce912a, + 0x9b69dbe1b548ce7c, 0xc986afbe3ee11aba, + 0xc24452da229b021b, 0xfbe85badce996168, + 0xf2d56790ab41c2a2, 0xfae27299423fb9c3, + 0x97c560ba6b0919a5, 0xdccd879fc967d41a, + 0xbdb6b8e905cb600f, 0x5400e987bbc1c920, + 0xed246723473e3813, 0x290123e9aab23b68, + 0x9436c0760c86e30b, 0xf9a0b6720aaf6521, + 0xb94470938fa89bce, 0xf808e40e8d5b3e69, + 0xe7958cb87392c2c2, 0xb60b1d1230b20e04, + 0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c2, + 0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af3, + 
0xe2280b6c20dd5232, 0x25c6da63c38de1b0, + 0x8d590723948a535f, 0x579c487e5a38ad0e, + 0xb0af48ec79ace837, 0x2d835a9df0c6d851, + 0xdcdb1b2798182244, 0xf8e431456cf88e65, + 0x8a08f0f8bf0f156b, 0x1b8e9ecb641b58ff, + 0xac8b2d36eed2dac5, 0xe272467e3d222f3f, + 0xd7adf884aa879177, 0x5b0ed81dcc6abb0f, + 0x86ccbb52ea94baea, 0x98e947129fc2b4e9, + 0xa87fea27a539e9a5, 0x3f2398d747b36224, + 0xd29fe4b18e88640e, 0x8eec7f0d19a03aad, + 0x83a3eeeef9153e89, 0x1953cf68300424ac, + 0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd7, + 0xcdb02555653131b6, 0x3792f412cb06794d, + 0x808e17555f3ebf11, 0xe2bbd88bbee40bd0, + 0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec4, + 0xc8de047564d20a8b, 0xf245825a5a445275, + 0xfb158592be068d2e, 0xeed6e2f0f0d56712, + 0x9ced737bb6c4183d, 0x55464dd69685606b, + 0xc428d05aa4751e4c, 0xaa97e14c3c26b886, + 0xf53304714d9265df, 0xd53dd99f4b3066a8, + 0x993fe2c6d07b7fab, 0xe546a8038efe4029, + 0xbf8fdb78849a5f96, 0xde98520472bdd033, + 0xef73d256a5c0f77c, 0x963e66858f6d4440, + 0x95a8637627989aad, 0xdde7001379a44aa8, + 0xbb127c53b17ec159, 0x5560c018580d5d52, + 0xe9d71b689dde71af, 0xaab8f01e6e10b4a6, + 0x9226712162ab070d, 0xcab3961304ca70e8, + 0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d22, + 0xe45c10c42a2b3b05, 0x8cb89a7db77c506a, + 0x8eb98a7a9a5b04e3, 0x77f3608e92adb242, + 0xb267ed1940f1c61c, 0x55f038b237591ed3, + 0xdf01e85f912e37a3, 0x6b6c46dec52f6688, + 0x8b61313bbabce2c6, 0x2323ac4b3b3da015, + 0xae397d8aa96c1b77, 0xabec975e0a0d081a, + 0xd9c7dced53c72255, 0x96e7bd358c904a21, + 0x881cea14545c7575, 0x7e50d64177da2e54, + 0xaa242499697392d2, 0xdde50bd1d5d0b9e9, + 0xd4ad2dbfc3d07787, 0x955e4ec64b44e864, + 0x84ec3c97da624ab4, 0xbd5af13bef0b113e, + 0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58e, + 0xcfb11ead453994ba, 0x67de18eda5814af2, + 0x81ceb32c4b43fcf4, 0x80eacf948770ced7, + 0xa2425ff75e14fc31, 0xa1258379a94d028d, + 0xcad2f7f5359a3b3e, 0x96ee45813a04330, + 0xfd87b5f28300ca0d, 0x8bca9d6e188853fc, + 0x9e74d1b791e07e48, 0x775ea264cf55347e, + 0xc612062576589dda, 0x95364afe032a819e, + 0xf79687aed3eec551, 
0x3a83ddbd83f52205, + 0x9abe14cd44753b52, 0xc4926a9672793543, + 0xc16d9a0095928a27, 0x75b7053c0f178294, + 0xf1c90080baf72cb1, 0x5324c68b12dd6339, + 0x971da05074da7bee, 0xd3f6fc16ebca5e04, + 0xbce5086492111aea, 0x88f4bb1ca6bcf585, + 0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6, + 0x9392ee8e921d5d07, 0x3aff322e62439fd0, + 0xb877aa3236a4b449, 0x9befeb9fad487c3, + 0xe69594bec44de15b, 0x4c2ebe687989a9b4, + 0x901d7cf73ab0acd9, 0xf9d37014bf60a11, + 0xb424dc35095cd80f, 0x538484c19ef38c95, + 0xe12e13424bb40e13, 0x2865a5f206b06fba, + 0x8cbccc096f5088cb, 0xf93f87b7442e45d4, + 0xafebff0bcb24aafe, 0xf78f69a51539d749, + 0xdbe6fecebdedd5be, 0xb573440e5a884d1c, + 0x89705f4136b4a597, 0x31680a88f8953031, + 0xabcc77118461cefc, 0xfdc20d2b36ba7c3e, + 0xd6bf94d5e57a42bc, 0x3d32907604691b4d, + 0x8637bd05af6c69b5, 0xa63f9a49c2c1b110, + 0xa7c5ac471b478423, 0xfcf80dc33721d54, + 0xd1b71758e219652b, 0xd3c36113404ea4a9, + 0x83126e978d4fdf3b, 0x645a1cac083126ea, + 0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4, + 0xcccccccccccccccc, 0xcccccccccccccccd, + 0x8000000000000000, 0x0, + 0xa000000000000000, 0x0, + 0xc800000000000000, 0x0, + 0xfa00000000000000, 0x0, + 0x9c40000000000000, 0x0, + 0xc350000000000000, 0x0, + 0xf424000000000000, 0x0, + 0x9896800000000000, 0x0, + 0xbebc200000000000, 0x0, + 0xee6b280000000000, 0x0, + 0x9502f90000000000, 0x0, + 0xba43b74000000000, 0x0, + 0xe8d4a51000000000, 0x0, + 0x9184e72a00000000, 0x0, + 0xb5e620f480000000, 0x0, + 0xe35fa931a0000000, 0x0, + 0x8e1bc9bf04000000, 0x0, + 0xb1a2bc2ec5000000, 0x0, + 0xde0b6b3a76400000, 0x0, + 0x8ac7230489e80000, 0x0, + 0xad78ebc5ac620000, 0x0, + 0xd8d726b7177a8000, 0x0, + 0x878678326eac9000, 0x0, + 0xa968163f0a57b400, 0x0, + 0xd3c21bcecceda100, 0x0, + 0x84595161401484a0, 0x0, + 0xa56fa5b99019a5c8, 0x0, + 0xcecb8f27f4200f3a, 0x0, + 0x813f3978f8940984, 0x4000000000000000, + 0xa18f07d736b90be5, 0x5000000000000000, + 0xc9f2c9cd04674ede, 0xa400000000000000, + 0xfc6f7c4045812296, 0x4d00000000000000, + 0x9dc5ada82b70b59d, 0xf020000000000000, + 
0xc5371912364ce305, 0x6c28000000000000, + 0xf684df56c3e01bc6, 0xc732000000000000, + 0x9a130b963a6c115c, 0x3c7f400000000000, + 0xc097ce7bc90715b3, 0x4b9f100000000000, + 0xf0bdc21abb48db20, 0x1e86d40000000000, + 0x96769950b50d88f4, 0x1314448000000000, + 0xbc143fa4e250eb31, 0x17d955a000000000, + 0xeb194f8e1ae525fd, 0x5dcfab0800000000, + 0x92efd1b8d0cf37be, 0x5aa1cae500000000, + 0xb7abc627050305ad, 0xf14a3d9e40000000, + 0xe596b7b0c643c719, 0x6d9ccd05d0000000, + 0x8f7e32ce7bea5c6f, 0xe4820023a2000000, + 0xb35dbf821ae4f38b, 0xdda2802c8a800000, + 0xe0352f62a19e306e, 0xd50b2037ad200000, + 0x8c213d9da502de45, 0x4526f422cc340000, + 0xaf298d050e4395d6, 0x9670b12b7f410000, + 0xdaf3f04651d47b4c, 0x3c0cdd765f114000, + 0x88d8762bf324cd0f, 0xa5880a69fb6ac800, + 0xab0e93b6efee0053, 0x8eea0d047a457a00, + 0xd5d238a4abe98068, 0x72a4904598d6d880, + 0x85a36366eb71f041, 0x47a6da2b7f864750, + 0xa70c3c40a64e6c51, 0x999090b65f67d924, + 0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d, + 0x82818f1281ed449f, 0xbff8f10e7a8921a4, + 0xa321f2d7226895c7, 0xaff72d52192b6a0d, + 0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490, + 0xfee50b7025c36a08, 0x2f236d04753d5b4, + 0x9f4f2726179a2245, 0x1d762422c946590, + 0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5, + 0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2, + 0x9b934c3b330c8577, 0x63cc55f49f88eb2f, + 0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb, + 0xf316271c7fc3908a, 0x8bef464e3945ef7a, + 0x97edd871cfda3a56, 0x97758bf0e3cbb5ac, + 0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317, + 0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd, + 0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a, + 0xb975d6b6ee39e436, 0xb3e2fd538e122b44, + 0xe7d34c64a9c85d44, 0x60dbbca87196b616, + 0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd, + 0xb51d13aea4a488dd, 0x6babab6398bdbe41, + 0xe264589a4dcdab14, 0xc696963c7eed2dd1, + 0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2, + 0xb0de65388cc8ada8, 0x3b25a55f43294bcb, + 0xdd15fe86affad912, 0x49ef0eb713f39ebe, + 0x8a2dbf142dfcc7ab, 0x6e3569326c784337, + 0xacb92ed9397bf996, 0x49c2c37f07965404, + 0xd7e77a8f87daf7fb, 
0xdc33745ec97be906, + 0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3, + 0xa8acd7c0222311bc, 0xc40832ea0d68ce0c, + 0xd2d80db02aabd62b, 0xf50a3fa490c30190, + 0x83c7088e1aab65db, 0x792667c6da79e0fa, + 0xa4b8cab1a1563f52, 0x577001b891185938, + 0xcde6fd5e09abcf26, 0xed4c0226b55e6f86, + 0x80b05e5ac60b6178, 0x544f8158315b05b4, + 0xa0dc75f1778e39d6, 0x696361ae3db1c721, + 0xc913936dd571c84c, 0x3bc3a19cd1e38e9, + 0xfb5878494ace3a5f, 0x4ab48a04065c723, + 0x9d174b2dcec0e47b, 0x62eb0d64283f9c76, + 0xc45d1df942711d9a, 0x3ba5d0bd324f8394, + 0xf5746577930d6500, 0xca8f44ec7ee36479, + 0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb, + 0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e, + 0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e, + 0x95d04aee3b80ece5, 0xbba1f1d158724a12, + 0xbb445da9ca61281f, 0x2a8a6e45ae8edc97, + 0xea1575143cf97226, 0xf52d09d71a3293bd, + 0x924d692ca61be758, 0x593c2626705f9c56, + 0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c, + 0xe498f455c38b997a, 0xb6dfb9c0f956447, + 0x8edf98b59a373fec, 0x4724bd4189bd5eac, + 0xb2977ee300c50fe7, 0x58edec91ec2cb657, + 0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed, + 0x8b865b215899f46c, 0xbd79e0d20082ee74, + 0xae67f1e9aec07187, 0xecd8590680a3aa11, + 0xda01ee641a708de9, 0xe80e6f4820cc9495, + 0x884134fe908658b2, 0x3109058d147fdcdd, + 0xaa51823e34a7eede, 0xbd4b46f0599fd415, + 0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a, + 0x850fadc09923329e, 0x3e2cf6bc604ddb0, + 0xa6539930bf6bff45, 0x84db8346b786151c, + 0xcfe87f7cef46ff16, 0xe612641865679a63, + 0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e, + 0xa26da3999aef7749, 0xe3be5e330f38f09d, + 0xcb090c8001ab551c, 0x5cadf5bfd3072cc5, + 0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6, + 0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa, + 0xc646d63501a1511d, 0xb281e1fd541501b8, + 0xf7d88bc24209a565, 0x1f225a7ca91a4226, + 0x9ae757596946075f, 0x3375788de9b06958, + 0xc1a12d2fc3978937, 0x52d6b1641c83ae, + 0xf209787bb47d6b84, 0xc0678c5dbd23a49a, + 0x9745eb4d50ce6332, 0xf840b7ba963646e0, + 0xbd176620a501fbff, 0xb650e5a93bc3d898, + 0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe, + 
0x93ba47c980e98cdf, 0xc66f336c36b10137, + 0xb8a8d9bbe123f017, 0xb80b0047445d4184, + 0xe6d3102ad96cec1d, 0xa60dc059157491e5, + 0x9043ea1ac7e41392, 0x87c89837ad68db2f, + 0xb454e4a179dd1877, 0x29babe4598c311fb, + 0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a, + 0x8ce2529e2734bb1d, 0x1899e4a65f58660c, + 0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f, + 0xdc21a1171d42645d, 0x76707543f4fa1f73, + 0x899504ae72497eba, 0x6a06494a791c53a8, + 0xabfa45da0edbde69, 0x487db9d17636892, + 0xd6f8d7509292d603, 0x45a9d2845d3c42b6, + 0x865b86925b9bc5c2, 0xb8a2392ba45a9b2, + 0xa7f26836f282b732, 0x8e6cac7768d7141e, + 0xd1ef0244af2364ff, 0x3207d795430cd926, + 0x8335616aed761f1f, 0x7f44e6bd49e807b8, + 0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6, + 0xcd036837130890a1, 0x36dba887c37a8c0f, + 0x802221226be55a64, 0xc2494954da2c9789, + 0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c, + 0xc83553c5c8965d3d, 0x6f92829494e5acc7, + 0xfa42a8b73abbf48c, 0xcb772339ba1f17f9, + 0x9c69a97284b578d7, 0xff2a760414536efb, + 0xc38413cf25e2d70d, 0xfef5138519684aba, + 0xf46518c2ef5b8cd1, 0x7eb258665fc25d69, + 0x98bf2f79d5993802, 0xef2f773ffbd97a61, + 0xbeeefb584aff8603, 0xaafb550ffacfd8fa, + 0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38, + 0x952ab45cfa97a0b2, 0xdd945a747bf26183, + 0xba756174393d88df, 0x94f971119aeef9e4, + 0xe912b9d1478ceb17, 0x7a37cd5601aab85d, + 0x91abb422ccb812ee, 0xac62e055c10ab33a, + 0xb616a12b7fe617aa, 0x577b986b314d6009, + 0xe39c49765fdf9d94, 0xed5a7e85fda0b80b, + 0x8e41ade9fbebc27d, 0x14588f13be847307, + 0xb1d219647ae6b31c, 0x596eb2d8ae258fc8, + 0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb, + 0x8aec23d680043bee, 0x25de7bb9480d5854, + 0xada72ccc20054ae9, 0xaf561aa79a10ae6a, + 0xd910f7ff28069da4, 0x1b2ba1518094da04, + 0x87aa9aff79042286, 0x90fb44d2f05d0842, + 0xa99541bf57452b28, 0x353a1607ac744a53, + 0xd3fa922f2d1675f2, 0x42889b8997915ce8, + 0x847c9b5d7c2e09b7, 0x69956135febada11, + 0xa59bc234db398c25, 0x43fab9837e699095, + 0xcf02b2c21207ef2e, 0x94f967e45e03f4bb, + 0x8161afb94b44f57d, 0x1d1be0eebac278f5, + 0xa1ba1ba79e1632dc, 
0x6462d92a69731732, + 0xca28a291859bbf93, 0x7d7b8f7503cfdcfe, + 0xfcb2cb35e702af78, 0x5cda735244c3d43e, + 0x9defbf01b061adab, 0x3a0888136afa64a7, + 0xc56baec21c7a1916, 0x88aaa1845b8fdd0, + 0xf6c69a72a3989f5b, 0x8aad549e57273d45, + 0x9a3c2087a63f6399, 0x36ac54e2f678864b, + 0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd, + 0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5, + 0x969eb7c47859e743, 0x9f644ae5a4b1b325, + 0xbc4665b596706114, 0x873d5d9f0dde1fee, + 0xeb57ff22fc0c7959, 0xa90cb506d155a7ea, + 0x9316ff75dd87cbd8, 0x9a7f12442d588f2, + 0xb7dcbf5354e9bece, 0xc11ed6d538aeb2f, + 0xe5d3ef282a242e81, 0x8f1668c8a86da5fa, + 0x8fa475791a569d10, 0xf96e017d694487bc, + 0xb38d92d760ec4455, 0x37c981dcc395a9ac, + 0xe070f78d3927556a, 0x85bbe253f47b1417, + 0x8c469ab843b89562, 0x93956d7478ccec8e, + 0xaf58416654a6babb, 0x387ac8d1970027b2, + 0xdb2e51bfe9d0696a, 0x6997b05fcc0319e, + 0x88fcf317f22241e2, 0x441fece3bdf81f03, + 0xab3c2fddeeaad25a, 0xd527e81cad7626c3, + 0xd60b3bd56a5586f1, 0x8a71e223d8d3b074, + 0x85c7056562757456, 0xf6872d5667844e49, + 0xa738c6bebb12d16c, 0xb428f8ac016561db, + 0xd106f86e69d785c7, 0xe13336d701beba52, + 0x82a45b450226b39c, 0xecc0024661173473, + 0xa34d721642b06084, 0x27f002d7f95d0190, + 0xcc20ce9bd35c78a5, 0x31ec038df7b441f4, + 0xff290242c83396ce, 0x7e67047175a15271, + 0x9f79a169bd203e41, 0xf0062c6e984d386, + 0xc75809c42c684dd1, 0x52c07b78a3e60868, + 0xf92e0c3537826145, 0xa7709a56ccdf8a82, + 0x9bbcc7a142b17ccb, 0x88a66076400bb691, + 0xc2abf989935ddbfe, 0x6acff893d00ea435, + 0xf356f7ebf83552fe, 0x583f6b8c4124d43, + 0x98165af37b2153de, 0xc3727a337a8b704a, + 0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c, + 0xeda2ee1c7064130c, 0x1162def06f79df73, + 0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8, + 0xb9a74a0637ce2ee1, 0x6d953e2bd7173692, + 0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437, + 0x910ab1d4db9914a0, 0x1d9c9892400a22a2, + 0xb54d5e4a127f59c8, 0x2503beb6d00cab4b, + 0xe2a0b5dc971f303a, 0x2e44ae64840fd61d, + 0x8da471a9de737e24, 0x5ceaecfed289e5d2, + 0xb10d8e1456105dad, 0x7425a83e872c5f47, + 
0xdd50f1996b947518, 0xd12f124e28f77719, + 0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f, + 0xace73cbfdc0bfb7b, 0x636cc64d1001550b, + 0xd8210befd30efa5a, 0x3c47f7e05401aa4e, + 0x8714a775e3e95c78, 0x65acfaec34810a71, + 0xa8d9d1535ce3b396, 0x7f1839a741a14d0d, + 0xd31045a8341ca07c, 0x1ede48111209a050, + 0x83ea2b892091e44d, 0x934aed0aab460432, + 0xa4e4b66b68b65d60, 0xf81da84d5617853f, + 0xce1de40642e3f4b9, 0x36251260ab9d668e, + 0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019, + 0xa1075a24e4421730, 0xb24cf65b8612f81f, + 0xc94930ae1d529cfc, 0xdee033f26797b627, + 0xfb9b7cd9a4a7443c, 0x169840ef017da3b1, + 0x9d412e0806e88aa5, 0x8e1f289560ee864e, + 0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2, + 0xf5b5d7ec8acb58a2, 0xae10af696774b1db, + 0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29, + 0xbff610b0cc6edd3f, 0x17fd090a58d32af3, + 0xeff394dcff8a948e, 0xddfc4b4cef07f5b0, + 0x95f83d0a1fb69cd9, 0x4abdaf101564f98e, + 0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1, + 0xea53df5fd18d5513, 0x84c86189216dc5ed, + 0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4, + 0xb7118682dbb66a77, 0x3fbc8c33221dc2a1, + 0xe4d5e82392a40515, 0xfabaf3feaa5334a, + 0x8f05b1163ba6832d, 0x29cb4d87f2a7400e, + 0xb2c71d5bca9023f8, 0x743e20e9ef511012, + 0xdf78e4b2bd342cf6, 0x914da9246b255416, + 0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e, + 0xae9672aba3d0c320, 0xa184ac2473b529b1, + 0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e, + 0x8865899617fb1871, 0x7e2fa67c7a658892, + 0xaa7eebfb9df9de8d, 0xddbb901b98feeab7, + 0xd51ea6fa85785631, 0x552a74227f3ea565, + 0x8533285c936b35de, 0xd53a88958f87275f, + 0xa67ff273b8460356, 0x8a892abaf368f137, + 0xd01fef10a657842c, 0x2d2b7569b0432d85, + 0x8213f56a67f6b29b, 0x9c3b29620e29fc73, + 0xa298f2c501f45f42, 0x8349f3ba91b47b8f, + 0xcb3f2f7642717713, 0x241c70a936219a73, + 0xfe0efb53d30dd4d7, 0xed238cd383aa0110, + 0x9ec95d1463e8a506, 0xf4363804324a40aa, + 0xc67bb4597ce2ce48, 0xb143c6053edcd0d5, + 0xf81aa16fdc1b81da, 0xdd94b7868e94050a, + 0x9b10a4e5e9913128, 0xca7cf2b4191c8326, + 0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0, + 0xf24a01a73cf2dccf, 
0xbc633b39673c8cec, + 0x976e41088617ca01, 0xd5be0503e085d813, + 0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18, + 0xec9c459d51852ba2, 0xddf8e7d60ed1219e, + 0x93e1ab8252f33b45, 0xcabb90e5c942b503, + 0xb8da1662e7b00a17, 0x3d6a751f3b936243, + 0xe7109bfba19c0c9d, 0xcc512670a783ad4, + 0x906a617d450187e2, 0x27fb2b80668b24c5, + 0xb484f9dc9641e9da, 0xb1f9f660802dedf6, + 0xe1a63853bbd26451, 0x5e7873f8a0396973, + 0x8d07e33455637eb2, 0xdb0b487b6423e1e8, + 0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62, + 0xdc5c5301c56b75f7, 0x7641a140cc7810fb, + 0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d, + 0xac2820d9623bf429, 0x546345fa9fbdcd44, + 0xd732290fbacaf133, 0xa97c177947ad4095, + 0x867f59a9d4bed6c0, 0x49ed8eabcccc485d, + 0xa81f301449ee8c70, 0x5c68f256bfff5a74, + 0xd226fc195c6a2f8c, 0x73832eec6fff3111, + 0x83585d8fd9c25db7, 0xc831fd53c5ff7eab, + 0xa42e74f3d032f525, 0xba3e7ca8b77f5e55, + 0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb, + 0x80444b5e7aa7cf85, 0x7980d163cf5b81b3, + 0xa0555e361951c366, 0xd7e105bcc332621f, + 0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7, + 0xfa856334878fc150, 0xb14f98f6f0feb951, + 0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3, + 0xc3b8358109e84f07, 0xa862f80ec4700c8, + 0xf4a642e14c6262c8, 0xcd27bb612758c0fa, + 0x98e7e9cccfbd7dbd, 0x8038d51cb897789c, + 0xbf21e44003acdd2c, 0xe0470a63e6bd56c3, + 0xeeea5d5004981478, 0x1858ccfce06cac74, + 0x95527a5202df0ccb, 0xf37801e0c43ebc8, + 0xbaa718e68396cffd, 0xd30560258f54e6ba, + 0xe950df20247c83fd, 0x47c6b82ef32a2069, + 0x91d28b7416cdd27e, 0x4cdc331d57fa5441, + 0xb6472e511c81471d, 0xe0133fe4adf8e952, + 0xe3d8f9e563a198e5, 0x58180fddd97723a6, + 0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648, + }; +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template +constexpr uint64_t + powers_template::power_of_five_128[number_of_entries]; + +#endif + +using powers = powers_template<>; + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_DECIMAL_TO_BINARY_H +#define FASTFLOAT_DECIMAL_TO_BINARY_H + +#include +#include +#include +#include +#include +#include + 
+namespace fast_float { + +// This will compute or rather approximate w * 5**q and return a pair of 64-bit +// words approximating the result, with the "high" part corresponding to the +// most significant bits and the low part corresponding to the least significant +// bits. +// +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 value128 +compute_product_approximation(int64_t q, uint64_t w) { + const int index = 2 * int(q - powers::smallest_power_of_five); + // For small values of q, e.g., q in [0,27], the answer is always exact + // because The line value128 firstproduct = full_multiplication(w, + // power_of_five_128[index]); gives the exact answer. + value128 firstproduct = + full_multiplication(w, powers::power_of_five_128[index]); + static_assert((bit_precision >= 0) && (bit_precision <= 64), + " precision should be in (0,64]"); + constexpr uint64_t precision_mask = + (bit_precision < 64) ? (uint64_t(0xFFFFFFFFFFFFFFFF) >> bit_precision) + : uint64_t(0xFFFFFFFFFFFFFFFF); + if ((firstproduct.high & precision_mask) == + precision_mask) { // could further guard with (lower + w < lower) + // regarding the second product, we only need secondproduct.high, but our + // expectation is that the compiler will optimize this extra work away if + // needed. 
+ value128 secondproduct = + full_multiplication(w, powers::power_of_five_128[index + 1]); + firstproduct.low += secondproduct.high; + if (secondproduct.high > firstproduct.low) { + firstproduct.high++; + } + } + return firstproduct; +} + +namespace detail { +/** + * For q in (0,350), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * floor(p) + q + * where + * p = log(5**q)/log(2) = q * log(5)/log(2) + * + * For negative values of q in (-400,0), we have that + * f = (((152170 + 65536) * q ) >> 16); + * is equal to + * -ceil(p) + q + * where + * p = log(5**-q)/log(2) = -q * log(5)/log(2) + */ +constexpr fastfloat_really_inline int32_t power(int32_t q) noexcept { + return (((152170 + 65536) * q) >> 16) + 63; +} +} // namespace detail + +// create an adjusted mantissa, biased by the invalid power2 +// for significant digits already multiplied by 10 ** q. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 adjusted_mantissa +compute_error_scaled(int64_t q, uint64_t w, int lz) noexcept { + int hilz = int(w >> 63) ^ 1; + adjusted_mantissa answer; + answer.mantissa = w << hilz; + int bias = binary::mantissa_explicit_bits() - binary::minimum_exponent(); + answer.power2 = int32_t(detail::power(int32_t(q)) + bias - hilz - lz - 62 + + invalid_am_bias); + return answer; +} + +// w * 10 ** q, without rounding the representation up. +// the power2 in the exponent will be adjusted by invalid_am_bias. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_error(int64_t q, uint64_t w) noexcept { + int lz = leading_zeroes(w); + w <<= lz; + value128 product = + compute_product_approximation(q, w); + return compute_error_scaled(q, product.high, lz); +} + +// w * 10 ** q +// The returned value should be a valid ieee64 number that simply need to be +// packed. However, in some very rare cases, the computation will fail. 
In such +// cases, we return an adjusted_mantissa with a negative power of 2: the caller +// should recompute in such cases. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +compute_float(int64_t q, uint64_t w) noexcept { + adjusted_mantissa answer; + if ((w == 0) || (q < binary::smallest_power_of_ten())) { + answer.power2 = 0; + answer.mantissa = 0; + // result should be zero + return answer; + } + if (q > binary::largest_power_of_ten()) { + // we want to get infinity: + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + return answer; + } + // At this point in time q is in [powers::smallest_power_of_five, + // powers::largest_power_of_five]. + + // We want the most significant bit of i to be 1. Shift if needed. + int lz = leading_zeroes(w); + w <<= lz; + + // The required precision is binary::mantissa_explicit_bits() + 3 because + // 1. We need the implicit bit + // 2. We need an extra bit for rounding purposes + // 3. We might lose a bit due to the "upperbit" routine (result too small, + // requiring a shift) + + value128 product = + compute_product_approximation(q, w); + // The computed 'product' is always sufficient. + // Mathematical proof: + // Noble Mushtak and Daniel Lemire, Fast Number Parsing Without Fallback (to + // appear) See script/mushtak_lemire.py + + // The "compute_product_approximation" function can be slightly slower than a + // branchless approach: value128 product = compute_product(q, w); but in + // practice, we can win big with the compute_product_approximation if its + // additional branch is easily predicted. Which is best is data specific. + int upperbit = int(product.high >> 63); + int shift = upperbit + 64 - binary::mantissa_explicit_bits() - 3; + + answer.mantissa = product.high >> shift; + + answer.power2 = int32_t(detail::power(int32_t(q)) + upperbit - lz - + binary::minimum_exponent()); + if (answer.power2 <= 0) { // we have a subnormal? 
+ // Here have that answer.power2 <= 0 so -answer.power2 >= 0 + if (-answer.power2 + 1 >= + 64) { // if we have more than 64 bits below the minimum exponent, you + // have a zero for sure. + answer.power2 = 0; + answer.mantissa = 0; + // result should be zero + return answer; + } + // next line is safe because -answer.power2 + 1 < 64 + answer.mantissa >>= -answer.power2 + 1; + // Thankfully, we can't have both "round-to-even" and subnormals because + // "round-to-even" only occurs for powers close to 0. + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + // There is a weird scenario where we don't have a subnormal but just. + // Suppose we start with 2.2250738585072013e-308, we end up + // with 0x3fffffffffffff x 2^-1023-53 which is technically subnormal + // whereas 0x40000000000000 x 2^-1023-53 is normal. Now, we need to round + // up 0x3fffffffffffff x 2^-1023-53 and once we do, we are no longer + // subnormal, but we can only know this after rounding. + // So we only declare a subnormal if we are smaller than the threshold. + answer.power2 = + (answer.mantissa < (uint64_t(1) << binary::mantissa_explicit_bits())) + ? 0 + : 1; + return answer; + } + + // usually, we round *up*, but if we fall right in between and and we have an + // even basis, we need to round down + // We are only concerned with the cases where 5**q fits in single 64-bit word. + if ((product.low <= 1) && (q >= binary::min_exponent_round_to_even()) && + (q <= binary::max_exponent_round_to_even()) && + ((answer.mantissa & 3) == 1)) { // we may fall between two floats! + // To be in-between two floats we need that in doing + // answer.mantissa = product.high >> (upperbit + 64 - + // binary::mantissa_explicit_bits() - 3); + // ... we dropped out only zeroes. But if this happened, then we can go + // back!!! 
+ if ((answer.mantissa << shift) == product.high) { + answer.mantissa &= ~uint64_t(1); // flip it so that we do not round up + } + } + + answer.mantissa += (answer.mantissa & 1); // round up + answer.mantissa >>= 1; + if (answer.mantissa >= (uint64_t(2) << binary::mantissa_explicit_bits())) { + answer.mantissa = (uint64_t(1) << binary::mantissa_explicit_bits()); + answer.power2++; // undo previous addition + } + + answer.mantissa &= ~(uint64_t(1) << binary::mantissa_explicit_bits()); + if (answer.power2 >= binary::infinite_power()) { // infinity + answer.power2 = binary::infinite_power(); + answer.mantissa = 0; + } + return answer; +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_BIGINT_H +#define FASTFLOAT_BIGINT_H + +#include +#include +#include +#include + + +namespace fast_float { + +// the limb width: we want efficient multiplication of double the bits in +// limb, or for 64-bit limbs, at least 64-bit multiplication where we can +// extract the high and low parts efficiently. this is every 64-bit +// architecture except for sparc, which emulates 128-bit multiplication. +// we might have platforms where `CHAR_BIT` is not 8, so let's avoid +// doing `8 * sizeof(limb)`. +#if defined(FASTFLOAT_64BIT) && !defined(__sparc) +#define FASTFLOAT_64BIT_LIMB 1 +typedef uint64_t limb; +constexpr size_t limb_bits = 64; +#else +#define FASTFLOAT_32BIT_LIMB +typedef uint32_t limb; +constexpr size_t limb_bits = 32; +#endif + +typedef span limb_span; + +// number of bits in a bigint. this needs to be at least the number +// of bits required to store the largest bigint, which is +// `log2(10**(digits + max_exp))`, or `log2(10**(767 + 342))`, or +// ~3600 bits, so we round to 4000. +constexpr size_t bigint_bits = 4000; +constexpr size_t bigint_limbs = bigint_bits / limb_bits; + +// vector-like type that is allocated on the stack. the entire +// buffer is pre-allocated, and only the length changes. 
+template struct stackvec { + limb data[size]; + // we never need more than 150 limbs + uint16_t length{0}; + + stackvec() = default; + stackvec(const stackvec &) = delete; + stackvec &operator=(const stackvec &) = delete; + stackvec(stackvec &&) = delete; + stackvec &operator=(stackvec &&other) = delete; + + // create stack vector from existing limb span. + FASTFLOAT_CONSTEXPR20 stackvec(limb_span s) { + FASTFLOAT_ASSERT(try_extend(s)); + } + + FASTFLOAT_CONSTEXPR14 limb &operator[](size_t index) noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + FASTFLOAT_CONSTEXPR14 const limb &operator[](size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + return data[index]; + } + // index from the end of the container + FASTFLOAT_CONSTEXPR14 const limb &rindex(size_t index) const noexcept { + FASTFLOAT_DEBUG_ASSERT(index < length); + size_t rindex = length - index - 1; + return data[rindex]; + } + + // set the length, without bounds checking. + FASTFLOAT_CONSTEXPR14 void set_len(size_t len) noexcept { + length = uint16_t(len); + } + constexpr size_t len() const noexcept { return length; } + constexpr bool is_empty() const noexcept { return length == 0; } + constexpr size_t capacity() const noexcept { return size; } + // append item to vector, without bounds checking + FASTFLOAT_CONSTEXPR14 void push_unchecked(limb value) noexcept { + data[length] = value; + length++; + } + // append item to vector, returning if item was added + FASTFLOAT_CONSTEXPR14 bool try_push(limb value) noexcept { + if (len() < capacity()) { + push_unchecked(value); + return true; + } else { + return false; + } + } + // add items to the vector, from a span, without bounds checking + FASTFLOAT_CONSTEXPR20 void extend_unchecked(limb_span s) noexcept { + limb *ptr = data + length; + std::copy_n(s.ptr, s.len(), ptr); + set_len(len() + s.len()); + } + // try to add items to the vector, returning if items were added + FASTFLOAT_CONSTEXPR20 bool 
try_extend(limb_span s) noexcept { + if (len() + s.len() <= capacity()) { + extend_unchecked(s); + return true; + } else { + return false; + } + } + // resize the vector, without bounds checking + // if the new size is longer than the vector, assign value to each + // appended item. + FASTFLOAT_CONSTEXPR20 + void resize_unchecked(size_t new_len, limb value) noexcept { + if (new_len > len()) { + size_t count = new_len - len(); + limb *first = data + len(); + limb *last = first + count; + ::std::fill(first, last, value); + set_len(new_len); + } else { + set_len(new_len); + } + } + // try to resize the vector, returning if the vector was resized. + FASTFLOAT_CONSTEXPR20 bool try_resize(size_t new_len, limb value) noexcept { + if (new_len > capacity()) { + return false; + } else { + resize_unchecked(new_len, value); + return true; + } + } + // check if any limbs are non-zero after the given index. + // this needs to be done in reverse order, since the index + // is relative to the most significant limbs. + FASTFLOAT_CONSTEXPR14 bool nonzero(size_t index) const noexcept { + while (index < len()) { + if (rindex(index) != 0) { + return true; + } + index++; + } + return false; + } + // normalize the big integer, so most-significant zero limbs are removed. 
+ FASTFLOAT_CONSTEXPR14 void normalize() noexcept { + while (len() > 0 && rindex(0) == 0) { + length--; + } + } +}; + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 uint64_t +empty_hi64(bool &truncated) noexcept { + truncated = false; + return 0; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, bool &truncated) noexcept { + truncated = false; + int shl = leading_zeroes(r0); + return r0 << shl; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint64_hi64(uint64_t r0, uint64_t r1, bool &truncated) noexcept { + int shl = leading_zeroes(r0); + if (shl == 0) { + truncated = r1 != 0; + return r0; + } else { + int shr = 64 - shl; + truncated = (r1 << shl) != 0; + return (r0 << shl) | (r1 >> shr); + } +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, bool &truncated) noexcept { + return uint64_hi64(r0, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + return uint64_hi64((x0 << 32) | x1, truncated); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 uint64_t +uint32_hi64(uint32_t r0, uint32_t r1, uint32_t r2, bool &truncated) noexcept { + uint64_t x0 = r0; + uint64_t x1 = r1; + uint64_t x2 = r2; + return uint64_hi64(x0, (x1 << 32) | x2, truncated); +} + +// add two small integers, checking for overflow. +// we want an efficient operation. for msvc, where +// we don't have built-in intrinsics, this is still +// pretty fast. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_add(limb x, limb y, bool &overflow) noexcept { + limb z; +// gcc and clang +#if defined(__has_builtin) +#if __has_builtin(__builtin_add_overflow) + if (!cpp20_and_in_constexpr()) { + overflow = __builtin_add_overflow(x, y, &z); + return z; + } +#endif +#endif + + // generic, this still optimizes correctly on MSVC. 
+ z = x + y; + overflow = z < x; + return z; +} + +// multiply two small integers, getting both the high and low bits. +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 limb +scalar_mul(limb x, limb y, limb &carry) noexcept { +#ifdef FASTFLOAT_64BIT_LIMB +#if defined(__SIZEOF_INT128__) + // GCC and clang both define it as an extension. + __uint128_t z = __uint128_t(x) * __uint128_t(y) + __uint128_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#else + // fallback, no native 128-bit integer multiplication with carry. + // on msvc, this optimizes identically, somehow. + value128 z = full_multiplication(x, y); + bool overflow; + z.low = scalar_add(z.low, carry, overflow); + z.high += uint64_t(overflow); // cannot overflow + carry = z.high; + return z.low; +#endif +#else + uint64_t z = uint64_t(x) * uint64_t(y) + uint64_t(carry); + carry = limb(z >> limb_bits); + return limb(z); +#endif +} + +// add scalar value to bigint starting from offset. +// used in grade school multiplication +template +inline FASTFLOAT_CONSTEXPR20 bool small_add_from(stackvec &vec, limb y, + size_t start) noexcept { + size_t index = start; + limb carry = y; + bool overflow; + while (carry != 0 && index < vec.len()) { + vec[index] = scalar_add(vec[index], carry, overflow); + carry = limb(overflow); + index += 1; + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add scalar value to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +small_add(stackvec &vec, limb y) noexcept { + return small_add_from(vec, y, 0); +} + +// multiply bigint by scalar value. +template +inline FASTFLOAT_CONSTEXPR20 bool small_mul(stackvec &vec, + limb y) noexcept { + limb carry = 0; + for (size_t index = 0; index < vec.len(); index++) { + vec[index] = scalar_mul(vec[index], y, carry); + } + if (carry != 0) { + FASTFLOAT_TRY(vec.try_push(carry)); + } + return true; +} + +// add bigint to bigint starting from index. 
+// used in grade school multiplication +template +FASTFLOAT_CONSTEXPR20 bool large_add_from(stackvec &x, limb_span y, + size_t start) noexcept { + // the effective x buffer is from `xstart..x.len()`, so exit early + // if we can't get that current range. + if (x.len() < start || y.len() > x.len() - start) { + FASTFLOAT_TRY(x.try_resize(y.len() + start, 0)); + } + + bool carry = false; + for (size_t index = 0; index < y.len(); index++) { + limb xi = x[index + start]; + limb yi = y[index]; + bool c1 = false; + bool c2 = false; + xi = scalar_add(xi, yi, c1); + if (carry) { + xi = scalar_add(xi, 1, c2); + } + x[index + start] = xi; + carry = c1 | c2; + } + + // handle overflow + if (carry) { + FASTFLOAT_TRY(small_add_from(x, 1, y.len() + start)); + } + return true; +} + +// add bigint to bigint. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +large_add_from(stackvec &x, limb_span y) noexcept { + return large_add_from(x, y, 0); +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool long_mul(stackvec &x, limb_span y) noexcept { + limb_span xs = limb_span(x.data, x.len()); + stackvec z(xs); + limb_span zs = limb_span(z.data, z.len()); + + if (y.len() != 0) { + limb y0 = y[0]; + FASTFLOAT_TRY(small_mul(x, y0)); + for (size_t index = 1; index < y.len(); index++) { + limb yi = y[index]; + stackvec zi; + if (yi != 0) { + // re-use the same buffer throughout + zi.set_len(0); + FASTFLOAT_TRY(zi.try_extend(zs)); + FASTFLOAT_TRY(small_mul(zi, yi)); + limb_span zis = limb_span(zi.data, zi.len()); + FASTFLOAT_TRY(large_add_from(x, zis, index)); + } + } + } + + x.normalize(); + return true; +} + +// grade-school multiplication algorithm +template +FASTFLOAT_CONSTEXPR20 bool large_mul(stackvec &x, limb_span y) noexcept { + if (y.len() == 1) { + FASTFLOAT_TRY(small_mul(x, y[0])); + } else { + FASTFLOAT_TRY(long_mul(x, y)); + } + return true; +} + +template struct pow5_tables { + static constexpr uint32_t large_step = 135; + static 
constexpr uint64_t small_power_of_5[] = { + 1UL, + 5UL, + 25UL, + 125UL, + 625UL, + 3125UL, + 15625UL, + 78125UL, + 390625UL, + 1953125UL, + 9765625UL, + 48828125UL, + 244140625UL, + 1220703125UL, + 6103515625UL, + 30517578125UL, + 152587890625UL, + 762939453125UL, + 3814697265625UL, + 19073486328125UL, + 95367431640625UL, + 476837158203125UL, + 2384185791015625UL, + 11920928955078125UL, + 59604644775390625UL, + 298023223876953125UL, + 1490116119384765625UL, + 7450580596923828125UL, + }; +#ifdef FASTFLOAT_64BIT_LIMB + constexpr static limb large_power_of_5[] = { + 1414648277510068013UL, 9180637584431281687UL, 4539964771860779200UL, + 10482974169319127550UL, 198276706040285095UL}; +#else + constexpr static limb large_power_of_5[] = { + 4279965485U, 329373468U, 4020270615U, 2137533757U, 4287402176U, + 1057042919U, 1071430142U, 2440757623U, 381945767U, 46164893U}; +#endif +}; + +#if FASTFLOAT_DETAIL_MUST_DEFINE_CONSTEXPR_VARIABLE + +template constexpr uint32_t pow5_tables::large_step; + +template constexpr uint64_t pow5_tables::small_power_of_5[]; + +template constexpr limb pow5_tables::large_power_of_5[]; + +#endif + +// big integer type. implements a small subset of big integer +// arithmetic, using simple algorithms since asymptotically +// faster algorithms are slower for a small number of limbs. +// all operations assume the big-integer is normalized. +struct bigint : pow5_tables<> { + // storage of the limbs, in little-endian order. + stackvec vec; + + FASTFLOAT_CONSTEXPR20 bigint() : vec() {} + bigint(const bigint &) = delete; + bigint &operator=(const bigint &) = delete; + bigint(bigint &&) = delete; + bigint &operator=(bigint &&other) = delete; + + FASTFLOAT_CONSTEXPR20 bigint(uint64_t value) : vec() { +#ifdef FASTFLOAT_64BIT_LIMB + vec.push_unchecked(value); +#else + vec.push_unchecked(uint32_t(value)); + vec.push_unchecked(uint32_t(value >> 32)); +#endif + vec.normalize(); + } + + // get the high 64 bits from the vector, and if bits were truncated. 
+ // this is to get the significant digits for the float. + FASTFLOAT_CONSTEXPR20 uint64_t hi64(bool &truncated) const noexcept { +#ifdef FASTFLOAT_64BIT_LIMB + if (vec.len() == 0) { + return empty_hi64(truncated); + } else if (vec.len() == 1) { + return uint64_hi64(vec.rindex(0), truncated); + } else { + uint64_t result = uint64_hi64(vec.rindex(0), vec.rindex(1), truncated); + truncated |= vec.nonzero(2); + return result; + } +#else + if (vec.len() == 0) { + return empty_hi64(truncated); + } else if (vec.len() == 1) { + return uint32_hi64(vec.rindex(0), truncated); + } else if (vec.len() == 2) { + return uint32_hi64(vec.rindex(0), vec.rindex(1), truncated); + } else { + uint64_t result = + uint32_hi64(vec.rindex(0), vec.rindex(1), vec.rindex(2), truncated); + truncated |= vec.nonzero(3); + return result; + } +#endif + } + + // compare two big integers, returning the large value. + // assumes both are normalized. if the return value is + // negative, other is larger, if the return value is + // positive, this is larger, otherwise they are equal. + // the limbs are stored in little-endian order, so we + // must compare the limbs in ever order. + FASTFLOAT_CONSTEXPR20 int compare(const bigint &other) const noexcept { + if (vec.len() > other.vec.len()) { + return 1; + } else if (vec.len() < other.vec.len()) { + return -1; + } else { + for (size_t index = vec.len(); index > 0; index--) { + limb xi = vec[index - 1]; + limb yi = other.vec[index - 1]; + if (xi > yi) { + return 1; + } else if (xi < yi) { + return -1; + } + } + return 0; + } + } + + // shift left each limb n bits, carrying over to the new limb + // returns true if we were able to shift all the digits. + FASTFLOAT_CONSTEXPR20 bool shl_bits(size_t n) noexcept { + // Internally, for each item, we shift left by n, and add the previous + // right shifted limb-bits. 
+ // For example, we transform (for u8) shifted left 2, to: + // b10100100 b01000010 + // b10 b10010001 b00001000 + FASTFLOAT_DEBUG_ASSERT(n != 0); + FASTFLOAT_DEBUG_ASSERT(n < sizeof(limb) * 8); + + size_t shl = n; + size_t shr = limb_bits - shl; + limb prev = 0; + for (size_t index = 0; index < vec.len(); index++) { + limb xi = vec[index]; + vec[index] = (xi << shl) | (prev >> shr); + prev = xi; + } + + limb carry = prev >> shr; + if (carry != 0) { + return vec.try_push(carry); + } + return true; + } + + // move the limbs left by `n` limbs. + FASTFLOAT_CONSTEXPR20 bool shl_limbs(size_t n) noexcept { + FASTFLOAT_DEBUG_ASSERT(n != 0); + if (n + vec.len() > vec.capacity()) { + return false; + } else if (!vec.is_empty()) { + // move limbs + limb *dst = vec.data + n; + const limb *src = vec.data; + std::copy_backward(src, src + vec.len(), dst + vec.len()); + // fill in empty limbs + limb *first = vec.data; + limb *last = first + n; + ::std::fill(first, last, 0); + vec.set_len(n + vec.len()); + return true; + } else { + return true; + } + } + + // move the limbs left by `n` bits. + FASTFLOAT_CONSTEXPR20 bool shl(size_t n) noexcept { + size_t rem = n % limb_bits; + size_t div = n / limb_bits; + if (rem != 0) { + FASTFLOAT_TRY(shl_bits(rem)); + } + if (div != 0) { + FASTFLOAT_TRY(shl_limbs(div)); + } + return true; + } + + // get the number of leading zeros in the bigint. + FASTFLOAT_CONSTEXPR20 int ctlz() const noexcept { + if (vec.is_empty()) { + return 0; + } else { +#ifdef FASTFLOAT_64BIT_LIMB + return leading_zeroes(vec.rindex(0)); +#else + // no use defining a specialized leading_zeroes for a 32-bit type. + uint64_t r0 = vec.rindex(0); + return leading_zeroes(r0 << 32); +#endif + } + } + + // get the number of bits in the bigint. 
+ FASTFLOAT_CONSTEXPR20 int bit_length() const noexcept { + int lz = ctlz(); + return int(limb_bits * vec.len()) - lz; + } + + FASTFLOAT_CONSTEXPR20 bool mul(limb y) noexcept { return small_mul(vec, y); } + + FASTFLOAT_CONSTEXPR20 bool add(limb y) noexcept { return small_add(vec, y); } + + // multiply as if by 2 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow2(uint32_t exp) noexcept { return shl(exp); } + + // multiply as if by 5 raised to a power. + FASTFLOAT_CONSTEXPR20 bool pow5(uint32_t exp) noexcept { + // multiply by a power of 5 + size_t large_length = sizeof(large_power_of_5) / sizeof(limb); + limb_span large = limb_span(large_power_of_5, large_length); + while (exp >= large_step) { + FASTFLOAT_TRY(large_mul(vec, large)); + exp -= large_step; + } +#ifdef FASTFLOAT_64BIT_LIMB + uint32_t small_step = 27; + limb max_native = 7450580596923828125UL; +#else + uint32_t small_step = 13; + limb max_native = 1220703125U; +#endif + while (exp >= small_step) { + FASTFLOAT_TRY(small_mul(vec, max_native)); + exp -= small_step; + } + if (exp != 0) { + // Work around clang bug https://godbolt.org/z/zedh7rrhc + // This is similar to https://github.com/llvm/llvm-project/issues/47746, + // except the workaround described there don't work here + FASTFLOAT_TRY(small_mul( + vec, limb(((void)small_power_of_5[0], small_power_of_5[exp])))); + } + + return true; + } + + // multiply as if by 10 raised to a power. 
+ FASTFLOAT_CONSTEXPR20 bool pow10(uint32_t exp) noexcept { + FASTFLOAT_TRY(pow5(exp)); + return pow2(exp); + } +}; + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_DIGIT_COMPARISON_H +#define FASTFLOAT_DIGIT_COMPARISON_H + +#include +#include +#include +#include + + +namespace fast_float { + +// 1e0 to 1e19 +constexpr static uint64_t powers_of_ten_uint64[] = {1UL, + 10UL, + 100UL, + 1000UL, + 10000UL, + 100000UL, + 1000000UL, + 10000000UL, + 100000000UL, + 1000000000UL, + 10000000000UL, + 100000000000UL, + 1000000000000UL, + 10000000000000UL, + 100000000000000UL, + 1000000000000000UL, + 10000000000000000UL, + 100000000000000000UL, + 1000000000000000000UL, + 10000000000000000000UL}; + +// calculate the exponent, in scientific notation, of the number. +// this algorithm is not even close to optimized, but it has no practical +// effect on performance: in order to have a faster algorithm, we'd need +// to slow down performance for faster algorithms, and this is still fast. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 int32_t +scientific_exponent(parsed_number_string_t &num) noexcept { + uint64_t mantissa = num.mantissa; + int32_t exponent = int32_t(num.exponent); + while (mantissa >= 10000) { + mantissa /= 10000; + exponent += 4; + } + while (mantissa >= 100) { + mantissa /= 100; + exponent += 2; + } + while (mantissa >= 10) { + mantissa /= 10; + exponent += 1; + } + return exponent; +} + +// this converts a native floating-point number to an extended-precision float. 
+template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended(T value) noexcept { + using equiv_uint = typename binary_format::equiv_uint; + constexpr equiv_uint exponent_mask = binary_format::exponent_mask(); + constexpr equiv_uint mantissa_mask = binary_format::mantissa_mask(); + constexpr equiv_uint hidden_bit_mask = binary_format::hidden_bit_mask(); + + adjusted_mantissa am; + int32_t bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); + equiv_uint bits; +#if FASTFLOAT_HAS_BIT_CAST + bits = std::bit_cast(value); +#else + ::memcpy(&bits, &value, sizeof(T)); +#endif + if ((bits & exponent_mask) == 0) { + // denormal + am.power2 = 1 - bias; + am.mantissa = bits & mantissa_mask; + } else { + // normal + am.power2 = int32_t((bits & exponent_mask) >> + binary_format::mantissa_explicit_bits()); + am.power2 -= bias; + am.mantissa = (bits & mantissa_mask) | hidden_bit_mask; + } + + return am; +} + +// get the extended precision value of the halfway point between b and b+u. +// we are given a native float that represents b, so we need to adjust it +// halfway between b and b+u. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +to_extended_halfway(T value) noexcept { + adjusted_mantissa am = to_extended(value); + am.mantissa <<= 1; + am.mantissa += 1; + am.power2 -= 1; + return am; +} + +// round an extended-precision float to the nearest machine float. +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void round(adjusted_mantissa &am, + callback cb) noexcept { + int32_t mantissa_shift = 64 - binary_format::mantissa_explicit_bits() - 1; + if (-am.power2 >= mantissa_shift) { + // have a denormal float + int32_t shift = -am.power2 + 1; + cb(am, std::min(shift, 64)); + // check for round-up: if rounding-nearest carried us to the hidden bit. + am.power2 = (am.mantissa < + (uint64_t(1) << binary_format::mantissa_explicit_bits())) + ? 
0 + : 1; + return; + } + + // have a normal float, use the default shift. + cb(am, mantissa_shift); + + // check for carry + if (am.mantissa >= + (uint64_t(2) << binary_format::mantissa_explicit_bits())) { + am.mantissa = (uint64_t(1) << binary_format::mantissa_explicit_bits()); + am.power2++; + } + + // check for infinite: we could have carried to an infinite power + am.mantissa &= ~(uint64_t(1) << binary_format::mantissa_explicit_bits()); + if (am.power2 >= binary_format::infinite_power()) { + am.power2 = binary_format::infinite_power(); + am.mantissa = 0; + } +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_nearest_tie_even(adjusted_mantissa &am, int32_t shift, + callback cb) noexcept { + const uint64_t mask = (shift == 64) ? UINT64_MAX : (uint64_t(1) << shift) - 1; + const uint64_t halfway = (shift == 0) ? 0 : uint64_t(1) << (shift - 1); + uint64_t truncated_bits = am.mantissa & mask; + bool is_above = truncated_bits > halfway; + bool is_halfway = truncated_bits == halfway; + + // shift digits into position + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; + + bool is_odd = (am.mantissa & 1) == 1; + am.mantissa += uint64_t(cb(is_odd, is_halfway, is_above)); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +round_down(adjusted_mantissa &am, int32_t shift) noexcept { + if (shift == 64) { + am.mantissa = 0; + } else { + am.mantissa >>= shift; + } + am.power2 += shift; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +skip_zeros(UC const *&first, UC const *last) noexcept { + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + break; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + break; + } + first++; + } +} + +// determine if any non-zero digits were truncated. +// all characters must be valid digits. 
+template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(UC const *first, UC const *last) noexcept { + // do 8-bit optimizations, can just compare to 8 literal 0s. + uint64_t val; + while (!cpp20_and_in_constexpr() && + std::distance(first, last) >= int_cmp_len()) { + ::memcpy(&val, first, sizeof(uint64_t)); + if (val != int_cmp_zeros()) { + return true; + } + first += int_cmp_len(); + } + while (first != last) { + if (*first != UC('0')) { + return true; + } + ++first; + } + return false; +} +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 bool +is_truncated(span s) noexcept { + return is_truncated(s.ptr, s.ptr + s.len()); +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +parse_eight_digits(const UC *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 100000000 + parse_eight_digits_unrolled(p); + p += 8; + counter += 8; + count += 8; +} + +template +fastfloat_really_inline FASTFLOAT_CONSTEXPR14 void +parse_one_digit(UC const *&p, limb &value, size_t &counter, + size_t &count) noexcept { + value = value * 10 + limb(*p - UC('0')); + p++; + counter++; + count++; +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +add_native(bigint &big, limb power, limb value) noexcept { + big.mul(power); + big.add(value); +} + +fastfloat_really_inline FASTFLOAT_CONSTEXPR20 void +round_up_bigint(bigint &big, size_t &count) noexcept { + // need to round-up the digits, but need to avoid rounding + // ....9999 to ...10000, which could cause a false halfway point. + add_native(big, 10, 1); + count++; +} + +// parse the significant digits into a big integer +template +inline FASTFLOAT_CONSTEXPR20 void +parse_mantissa(bigint &result, parsed_number_string_t &num, + size_t max_digits, size_t &digits) noexcept { + // try to minimize the number of big integer and scalar multiplication. + // therefore, try to parse 8 digits at a time, and multiply by the largest + // scalar value (9 or 19 digits) for each step. 
+ size_t counter = 0; + digits = 0; + limb value = 0; +#ifdef FASTFLOAT_64BIT_LIMB + size_t step = 19; +#else + size_t step = 9; +#endif + + // process all integer digits. + UC const *p = num.integer.ptr; + UC const *pend = p + num.integer.len(); + skip_zeros(p, pend); + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (num.fraction.ptr != nullptr) { + truncated |= is_truncated(num.fraction); + } + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + + // add our fraction digits, if they're available. 
+ if (num.fraction.ptr != nullptr) { + p = num.fraction.ptr; + pend = p + num.fraction.len(); + if (digits == 0) { + skip_zeros(p, pend); + } + // process all digits, in increments of step per loop + while (p != pend) { + while ((std::distance(p, pend) >= 8) && (step - counter >= 8) && + (max_digits - digits >= 8)) { + parse_eight_digits(p, value, counter, digits); + } + while (counter < step && p != pend && digits < max_digits) { + parse_one_digit(p, value, counter, digits); + } + if (digits == max_digits) { + // add the temporary value, then check if we've truncated any digits + add_native(result, limb(powers_of_ten_uint64[counter]), value); + bool truncated = is_truncated(p, pend); + if (truncated) { + round_up_bigint(result, digits); + } + return; + } else { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + counter = 0; + value = 0; + } + } + } + + if (counter != 0) { + add_native(result, limb(powers_of_ten_uint64[counter]), value); + } +} + +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +positive_digit_comp(bigint &bigmant, int32_t exponent) noexcept { + FASTFLOAT_ASSERT(bigmant.pow10(uint32_t(exponent))); + adjusted_mantissa answer; + bool truncated; + answer.mantissa = bigmant.hi64(truncated); + int bias = binary_format::mantissa_explicit_bits() - + binary_format::minimum_exponent(); + answer.power2 = bigmant.bit_length() - 64 + bias; + + round(answer, [truncated](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, + [truncated](bool is_odd, bool is_halfway, bool is_above) -> bool { + return is_above || (is_halfway && truncated) || + (is_odd && is_halfway); + }); + }); + + return answer; +} + +// the scaling here is quite simple: we have, for the real digits `m * 10^e`, +// and for the theoretical digits `n * 2^f`. Since `e` is always negative, +// to scale them identically, we do `n * 2^f * 5^-f`, so we now have `m * 2^e`. 
+// we then need to scale by `2^(f- e)`, and then the two significant digits +// are of the same magnitude. +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa negative_digit_comp( + bigint &bigmant, adjusted_mantissa am, int32_t exponent) noexcept { + bigint &real_digits = bigmant; + int32_t real_exp = exponent; + + // get the value of `b`, rounded down, and get a bigint representation of b+h + adjusted_mantissa am_b = am; + // gcc7 buf: use a lambda to remove the noexcept qualifier bug with + // -Wnoexcept-type. + round(am_b, + [](adjusted_mantissa &a, int32_t shift) { round_down(a, shift); }); + T b; + to_float(false, am_b, b); + adjusted_mantissa theor = to_extended_halfway(b); + bigint theor_digits(theor.mantissa); + int32_t theor_exp = theor.power2; + + // scale real digits and theor digits to be same power. + int32_t pow2_exp = theor_exp - real_exp; + uint32_t pow5_exp = uint32_t(-real_exp); + if (pow5_exp != 0) { + FASTFLOAT_ASSERT(theor_digits.pow5(pow5_exp)); + } + if (pow2_exp > 0) { + FASTFLOAT_ASSERT(theor_digits.pow2(uint32_t(pow2_exp))); + } else if (pow2_exp < 0) { + FASTFLOAT_ASSERT(real_digits.pow2(uint32_t(-pow2_exp))); + } + + // compare digits, and use it to director rounding + int ord = real_digits.compare(theor_digits); + adjusted_mantissa answer = am; + round(answer, [ord](adjusted_mantissa &a, int32_t shift) { + round_nearest_tie_even( + a, shift, [ord](bool is_odd, bool _, bool __) -> bool { + (void)_; // not needed, since we've done our comparison + (void)__; // not needed, since we've done our comparison + if (ord > 0) { + return true; + } else if (ord < 0) { + return false; + } else { + return is_odd; + } + }); + }); + + return answer; +} + +// parse the significant digits as a big integer to unambiguously round the +// the significant digits. here, we are trying to determine how to round +// an extended float representation close to `b+h`, halfway between `b` +// (the float rounded-down) and `b+u`, the next positive float. 
this +// algorithm is always correct, and uses one of two approaches. when +// the exponent is positive relative to the significant digits (such as +// 1234), we create a big-integer representation, get the high 64-bits, +// determine if any lower bits are truncated, and use that to direct +// rounding. in case of a negative exponent relative to the significant +// digits (such as 1.2345), we create a theoretical representation of +// `b` as a big-integer type, scaled to the same binary exponent as +// the actual digits. we then compare the big integer representations +// of both, and use that to direct rounding. +template +inline FASTFLOAT_CONSTEXPR20 adjusted_mantissa +digit_comp(parsed_number_string_t &num, adjusted_mantissa am) noexcept { + // remove the invalid exponent bias + am.power2 -= invalid_am_bias; + + int32_t sci_exp = scientific_exponent(num); + size_t max_digits = binary_format::max_digits(); + size_t digits = 0; + bigint bigmant; + parse_mantissa(bigmant, num, max_digits, digits); + // can't underflow, since digits is at most max_digits. + int32_t exponent = sci_exp + 1 - int32_t(digits); + if (exponent >= 0) { + return positive_digit_comp(bigmant, exponent); + } else { + return negative_digit_comp(bigmant, am, exponent); + } +} + +} // namespace fast_float + +#endif + +#ifndef FASTFLOAT_PARSE_NUMBER_H +#define FASTFLOAT_PARSE_NUMBER_H + + +#include +#include +#include +#include +namespace fast_float { + +namespace detail { +/** + * Special case +inf, -inf, nan, infinity, -infinity. + * The case comparisons could be made much faster given that we know that the + * strings a null-free and fixed. 
+ **/ +template +from_chars_result_t FASTFLOAT_CONSTEXPR14 parse_infnan(UC const *first, + UC const *last, + T &value) noexcept { + from_chars_result_t answer{}; + answer.ptr = first; + answer.ec = std::errc(); // be optimistic + bool minusSign = false; + if (*first == + UC('-')) { // assume first < last, so dereference without checks; + // C++17 20.19.3.(7.1) explicitly forbids '+' here + minusSign = true; + ++first; + } +#ifdef FASTFLOAT_ALLOWS_LEADING_PLUS // disabled by default + if (*first == UC('+')) { + ++first; + } +#endif + if (last - first >= 3) { + if (fastfloat_strncasecmp(first, str_const_nan(), 3)) { + answer.ptr = (first += 3); + value = minusSign ? -std::numeric_limits::quiet_NaN() + : std::numeric_limits::quiet_NaN(); + // Check for possible nan(n-char-seq-opt), C++17 20.19.3.7, + // C11 7.20.1.3.3. At least MSVC produces nan(ind) and nan(snan). + if (first != last && *first == UC('(')) { + for (UC const *ptr = first + 1; ptr != last; ++ptr) { + if (*ptr == UC(')')) { + answer.ptr = ptr + 1; // valid nan(n-char-seq-opt) + break; + } else if (!((UC('a') <= *ptr && *ptr <= UC('z')) || + (UC('A') <= *ptr && *ptr <= UC('Z')) || + (UC('0') <= *ptr && *ptr <= UC('9')) || *ptr == UC('_'))) + break; // forbidden char, not nan(n-char-seq-opt) + } + } + return answer; + } + if (fastfloat_strncasecmp(first, str_const_inf(), 3)) { + if ((last - first >= 8) && + fastfloat_strncasecmp(first + 3, str_const_inf() + 3, 5)) { + answer.ptr = first + 8; + } else { + answer.ptr = first + 3; + } + value = minusSign ? -std::numeric_limits::infinity() + : std::numeric_limits::infinity(); + return answer; + } + } + answer.ec = std::errc::invalid_argument; + return answer; +} + +/** + * Returns true if the floating-pointing rounding mode is to 'nearest'. + * It is the default on most system. This function is meant to be inexpensive. 
+ * Credit : @mwalcott3 + */ +fastfloat_really_inline bool rounds_to_nearest() noexcept { + // https://lemire.me/blog/2020/06/26/gcc-not-nearest/ +#if (FLT_EVAL_METHOD != 1) && (FLT_EVAL_METHOD != 0) + return false; +#endif + // See + // A fast function to check your floating-point rounding mode + // https://lemire.me/blog/2022/11/16/a-fast-function-to-check-your-floating-point-rounding-mode/ + // + // This function is meant to be equivalent to : + // prior: #include + // return fegetround() == FE_TONEAREST; + // However, it is expected to be much faster than the fegetround() + // function call. + // + // The volatile keywoard prevents the compiler from computing the function + // at compile-time. + // There might be other ways to prevent compile-time optimizations (e.g., + // asm). The value does not need to be std::numeric_limits::min(), any + // small value so that 1 + x should round to 1 would do (after accounting for + // excess precision, as in 387 instructions). + static volatile float fmin = std::numeric_limits::min(); + float fmini = fmin; // we copy it so that it gets loaded at most once. +// +// Explanation: +// Only when fegetround() == FE_TONEAREST do we have that +// fmin + 1.0f == 1.0f - fmin. +// +// FE_UPWARD: +// fmin + 1.0f > 1 +// 1.0f - fmin == 1 +// +// FE_DOWNWARD or FE_TOWARDZERO: +// fmin + 1.0f == 1 +// 1.0f - fmin < 1 +// +// Note: This may fail to be accurate if fast-math has been +// enabled, as rounding conventions may not apply. +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(push) +// todo: is there a VS warning? 
+// see +// https://stackoverflow.com/questions/46079446/is-there-a-warning-for-floating-point-equality-checking-in-visual-studio-2013 +#elif defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wfloat-equal" +#elif defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wfloat-equal" +#endif + return (fmini + 1.0f == 1.0f - fmini); +#ifdef FASTFLOAT_VISUAL_STUDIO +#pragma warning(pop) +#elif defined(__clang__) +#pragma clang diagnostic pop +#elif defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +} + +} // namespace detail + +template struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + return from_chars_advanced(first, last, value, options); + } +}; + +#if __STDCPP_FLOAT32_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float32_t &value, + parse_options_t options) noexcept { + // if std::float32_t is defined, and we are in C++23 mode; macro set for + // float32; set value to float due to equivalence between float and + // float32_t + float val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +#if __STDCPP_FLOAT64_T__ == 1 +template <> struct from_chars_caller { + template + FASTFLOAT_CONSTEXPR20 static from_chars_result_t + call(UC const *first, UC const *last, std::float64_t &value, + parse_options_t options) noexcept { + // if std::float64_t is defined, and we are in C++23 mode; macro set for + // float64; set value as double due to equivalence between double and + // float64_t + double val; + auto ret = from_chars_advanced(first, last, val, options); + value = val; + return ret; + } +}; +#endif + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, + 
chars_format fmt /*= chars_format::general*/) noexcept { + return from_chars_caller::call(first, last, value, + parse_options_t(fmt)); +} + +/** + * This function overload takes parsed_number_string_t structure that is created + * and populated either by from_chars_advanced function taking chars range and + * parsing options or other parsing custom function implemented by user. + */ +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(parsed_number_string_t &pns, T &value) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; + + answer.ec = std::errc(); // be optimistic + answer.ptr = pns.lastmatch; + // The implementation of the Clinger's fast path is convoluted because + // we want round-to-nearest in all cases, irrespective of the rounding mode + // selected on the thread. + // We proceed optimistically, assuming that detail::rounds_to_nearest() + // returns true. + if (binary_format::min_exponent_fast_path() <= pns.exponent && + pns.exponent <= binary_format::max_exponent_fast_path() && + !pns.too_many_digits) { + // Unfortunately, the conventional Clinger's fast path is only possible + // when the system rounds to the nearest float. + // + // We expect the next branch to almost always be selected. + // We could check it first (before the previous branch), but + // there might be performance advantages at having the check + // be last. + if (!cpp20_and_in_constexpr() && detail::rounds_to_nearest()) { + // We have that fegetround() == FE_TONEAREST. + // Next is Clinger's fast path. 
+ if (pns.mantissa <= binary_format::max_mantissa_fast_path()) { + value = T(pns.mantissa); + if (pns.exponent < 0) { + value = value / binary_format::exact_power_of_ten(-pns.exponent); + } else { + value = value * binary_format::exact_power_of_ten(pns.exponent); + } + if (pns.negative) { + value = -value; + } + return answer; + } + } else { + // We do not have that fegetround() == FE_TONEAREST. + // Next is a modified Clinger's fast path, inspired by Jakub Jelínek's + // proposal + if (pns.exponent >= 0 && + pns.mantissa <= + binary_format::max_mantissa_fast_path(pns.exponent)) { +#if defined(__clang__) || defined(FASTFLOAT_32BIT) + // Clang may map 0 to -0.0 when fegetround() == FE_DOWNWARD + if (pns.mantissa == 0) { + value = pns.negative ? T(-0.) : T(0.); + return answer; + } +#endif + value = T(pns.mantissa) * + binary_format::exact_power_of_ten(pns.exponent); + if (pns.negative) { + value = -value; + } + return answer; + } + } + } + adjusted_mantissa am = + compute_float>(pns.exponent, pns.mantissa); + if (pns.too_many_digits && am.power2 >= 0) { + if (am != compute_float>(pns.exponent, pns.mantissa + 1)) { + am = compute_error>(pns.exponent, pns.mantissa); + } + } + // If we called compute_float>(pns.exponent, pns.mantissa) + // and we have an invalid power (am.power2 < 0), then we need to go the long + // way around again. This is very uncommon. + if (am.power2 < 0) { + am = digit_comp(pns, am); + } + to_float(pns.negative, am, value); + // Test for over/underflow. 
+ if ((pns.mantissa != 0 && am.mantissa == 0 && am.power2 == 0) || + am.power2 == binary_format::infinite_power()) { + answer.ec = std::errc::result_out_of_range; + } + return answer; +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars_advanced(UC const *first, UC const *last, T &value, + parse_options_t options) noexcept { + + static_assert(is_supported_float_type(), + "only some floating-point types are supported"); + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + parsed_number_string_t pns = + parse_number_string(first, last, options); + if (!pns.valid) { + if (options.format & chars_format::no_infnan) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } else { + return detail::parse_infnan(first, last, value); + } + } + + // call overload that takes parsed_number_string_t directly. 
+ return from_chars_advanced(pns, value); +} + +template +FASTFLOAT_CONSTEXPR20 from_chars_result_t +from_chars(UC const *first, UC const *last, T &value, int base) noexcept { + static_assert(is_supported_char_type(), + "only char, wchar_t, char16_t and char32_t are supported"); + + from_chars_result_t answer; +#ifdef FASTFLOAT_SKIP_WHITE_SPACE // disabled by default + while ((first != last) && fast_float::is_space(uint8_t(*first))) { + first++; + } +#endif + if (first == last || base < 2 || base > 36) { + answer.ec = std::errc::invalid_argument; + answer.ptr = first; + return answer; + } + return parse_int_string(first, last, value, base); +} + +} // namespace fast_float + +#endif diff --git a/deps/fast_float_c_interface/Makefile b/deps/fast_float_c_interface/Makefile new file mode 100644 index 0000000000..4db3efe2c3 --- /dev/null +++ b/deps/fast_float_c_interface/Makefile @@ -0,0 +1,37 @@ +CCCOLOR:="\033[34m" +SRCCOLOR:="\033[33m" +ENDCOLOR:="\033[0m" + +CXX?=c++ +# we need = instead of := so that $@ in QUIET_CXX gets evaluated in the rule and is assigned appropriate value. +TEMP:=$(CXX) +QUIET_CXX=@printf ' %b %b\n' $(CCCOLOR)C++$(ENDCOLOR) $(SRCCOLOR)$@$(ENDCOLOR) 1>&2; +CXX=$(QUIET_CXX)$(TEMP) + +WARN=-Wall -W -Wno-missing-field-initializers + +STD=-pedantic -std=c++11 + +OPT?=-O3 +CLANG := $(findstring clang,$(shell sh -c '$(CC) --version | head -1')) +ifeq ($(OPT),-O3) + ifeq (clang,$(CLANG)) + OPT+=-flto + else + OPT+=-flto=auto -ffat-lto-objects + endif +endif + +# 1) Today src/Makefile passes -m32 flag for explicit 32-bit build on 64-bit machine, via CFLAGS. For 32-bit build on +# 32-bit machine and 64-bit on 64-bit machine, CFLAGS are empty. No other flags are set that can conflict with C++, +# therefore let's use CFLAGS without changes for now. +# 2) FASTFLOAT_ALLOWS_LEADING_PLUS allows +inf to be parsed as inf, instead of error. 
+CXXFLAGS=$(STD) $(OPT) $(WARN) -static -fPIC -fno-exceptions $(CFLAGS) -D FASTFLOAT_ALLOWS_LEADING_PLUS + +.PHONY: all clean + +all: fast_float_strtod.o + +clean: + rm -f *.o || true; + diff --git a/deps/fast_float_c_interface/fast_float_strtod.cpp b/deps/fast_float_c_interface/fast_float_strtod.cpp new file mode 100644 index 0000000000..8e5d19470f --- /dev/null +++ b/deps/fast_float_c_interface/fast_float_strtod.cpp @@ -0,0 +1,24 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. + * SPDX-License-Identifier: BSD 3-Clause + */ + +#include "../fast_float/fast_float.h" +#include + +extern "C" +{ + double fast_float_strtod(const char *str, const char** endptr) + { + double temp = 0; + auto answer = fast_float::from_chars(str, str + strlen(str), temp); + if (answer.ec != std::errc()) { + errno = (answer.ec == std::errc::result_out_of_range) ? ERANGE : EINVAL; + } + if (endptr) { + *endptr = answer.ptr; + } + return temp; + } +} diff --git a/deps/fpconv/CMakeLists.txt b/deps/fpconv/CMakeLists.txt new file mode 100644 index 0000000000..c586aa650a --- /dev/null +++ b/deps/fpconv/CMakeLists.txt @@ -0,0 +1,4 @@ +project(fpconv) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.c" "${CMAKE_CURRENT_LIST_DIR}/fpconv_dtoa.h") +add_library(fpconv STATIC ${SRCS}) diff --git a/deps/hdr_histogram/CMakeLists.txt b/deps/hdr_histogram/CMakeLists.txt new file mode 100644 index 0000000000..7b45bd76ba --- /dev/null +++ b/deps/hdr_histogram/CMakeLists.txt @@ -0,0 +1,7 @@ +project(hdr_histogram) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.c" "${CMAKE_CURRENT_LIST_DIR}/hdr_histogram.h" + "${CMAKE_CURRENT_LIST_DIR}/hdr_atomic.h" "${CMAKE_CURRENT_LIST_DIR}/hdr_redis_malloc.h") + +add_library(hdr_histogram STATIC ${SRCS}) +target_compile_definitions(hdr_histogram PRIVATE HDR_MALLOC_INCLUDE=\"hdr_redis_malloc.h\") diff --git a/deps/hiredis/.github/workflows/build.yml b/deps/hiredis/.github/workflows/build.yml index 581800b4f7..048ee51cd4 100644 --- 
a/deps/hiredis/.github/workflows/build.yml +++ b/deps/hiredis/.github/workflows/build.yml @@ -112,7 +112,7 @@ jobs: run: $GITHUB_WORKSPACE/test.sh freebsd: - runs-on: macos-12 + runs-on: macos-13 name: FreeBSD steps: - uses: actions/checkout@v3 diff --git a/deps/jemalloc/CMakeLists.txt b/deps/jemalloc/CMakeLists.txt new file mode 100644 index 0000000000..0fa99df55e --- /dev/null +++ b/deps/jemalloc/CMakeLists.txt @@ -0,0 +1,32 @@ +project(jemalloc) + +# Build jemalloc using configure && make install +set(JEMALLOC_INSTALL_DIR ${CMAKE_BINARY_DIR}/jemalloc-build) +set(JEMALLOC_SRC_DIR ${CMAKE_CURRENT_LIST_DIR}) +if (NOT EXISTS ${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a) + message(STATUS "Building jemalloc (custom build)") + message(STATUS "JEMALLOC_SRC_DIR = ${JEMALLOC_SRC_DIR}") + message(STATUS "JEMALLOC_INSTALL_DIR = ${JEMALLOC_INSTALL_DIR}") + + execute_process( + COMMAND sh -c "${JEMALLOC_SRC_DIR}/configure --disable-cxx \ + --with-version=5.3.0-0-g0 --with-lg-quantum=3 --disable-cache-oblivious --with-jemalloc-prefix=je_ \ + --enable-static --disable-shared --prefix=${JEMALLOC_INSTALL_DIR}" + WORKING_DIRECTORY ${JEMALLOC_SRC_DIR} RESULTS_VARIABLE CONFIGURE_RESULT) + + if (NOT ${CONFIGURE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc configure failed") + endif () + + execute_process(COMMAND make -j${VALKEY_PROCESSOR_COUNT} lib/libjemalloc.a install + WORKING_DIRECTORY "${JEMALLOC_SRC_DIR}" RESULTS_VARIABLE MAKE_RESULT) + + if (NOT ${MAKE_RESULT} EQUAL 0) + message(FATAL_ERROR "Jemalloc build failed") + endif () +endif () + +# Import the compiled library as a CMake target +add_library(jemalloc STATIC IMPORTED GLOBAL) +set_target_properties(jemalloc PROPERTIES IMPORTED_LOCATION "${JEMALLOC_INSTALL_DIR}/lib/libjemalloc.a" + INCLUDE_DIRECTORIES "${JEMALLOC_INSTALL_DIR}/include") diff --git a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 2cd7e7ce93..b0868b7d61 
100644 --- a/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -337,55 +337,4 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) { return fallback_alloc(size); } -JEMALLOC_ALWAYS_INLINE int -iget_defrag_hint(tsdn_t *tsdn, void* ptr) { - int defrag = 0; - emap_alloc_ctx_t alloc_ctx; - emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); - if (likely(alloc_ctx.slab)) { - /* Small allocation. */ - edata_t *slab = emap_edata_lookup(tsdn, &arena_emap_global, ptr); - arena_t *arena = arena_get_from_edata(slab); - szind_t binind = edata_szind_get(slab); - unsigned binshard = edata_binshard_get(slab); - bin_t *bin = arena_get_bin(arena, binind, binshard); - malloc_mutex_lock(tsdn, &bin->lock); - arena_dalloc_bin_locked_info_t info; - arena_dalloc_bin_locked_begin(&info, binind); - /* Don't bother moving allocations from the slab currently used for new allocations */ - if (slab != bin->slabcur) { - int free_in_slab = edata_nfree_get(slab); - if (free_in_slab) { - const bin_info_t *bin_info = &bin_infos[binind]; - /* Find number of non-full slabs and the number of regs in them */ - unsigned long curslabs = 0; - size_t curregs = 0; - /* Run on all bin shards (usually just one) */ - for (uint32_t i=0; i< bin_info->n_shards; i++) { - bin_t *bb = arena_get_bin(arena, binind, i); - curslabs += bb->stats.nonfull_slabs; - /* Deduct the regs in full slabs (they're not part of the game) */ - unsigned long full_slabs = bb->stats.curslabs - bb->stats.nonfull_slabs; - curregs += bb->stats.curregs - full_slabs * bin_info->nregs; - if (bb->slabcur) { - /* Remove slabcur from the overall utilization (not a candidate to nove from) */ - curregs -= bin_info->nregs - edata_nfree_get(bb->slabcur); - curslabs -= 1; - } - } - /* Compare the utilization ratio of the slab in question to the total average - * among non-full slabs. 
To avoid precision loss in division, we do that by - * extrapolating the usage of the slab as if all slabs have the same usage. - * If this slab is less used than the average, we'll prefer to move the data - * to hopefully more used ones. To avoid stagnation when all slabs have the same - * utilization, we give additional 12.5% weight to the decision to defrag. */ - defrag = (bin_info->nregs - free_in_slab) * curslabs <= curregs + curregs / 8; - } - } - arena_dalloc_bin_locked_finish(tsdn, arena, bin, &info); - malloc_mutex_unlock(tsdn, &bin->lock); - } - return defrag; -} - #endif /* JEMALLOC_INTERNAL_INLINES_C_H */ diff --git a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in index d04af34d93..ebb3137e6f 100644 --- a/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in +++ b/deps/jemalloc/include/jemalloc/jemalloc_macros.h.in @@ -147,7 +147,3 @@ #else # define JEMALLOC_SYS_NOTHROW JEMALLOC_NOTHROW #endif - -/* This version of Jemalloc, modified for Redis, has the je_get_defrag_hint() - * function. */ -#define JEMALLOC_FRAG_HINT diff --git a/deps/jemalloc/src/jemalloc.c b/deps/jemalloc/src/jemalloc.c index 83026093be..ea9232c5d6 100644 --- a/deps/jemalloc/src/jemalloc.c +++ b/deps/jemalloc/src/jemalloc.c @@ -4474,12 +4474,3 @@ jemalloc_postfork_child(void) { } /******************************************************************************/ - -/* Helps the application decide if a pointer is worth re-allocating in order to reduce fragmentation. - * returns 1 if the allocation should be moved, and 0 if the allocation be kept. - * If the application decides to re-allocate it should use MALLOCX_TCACHE_NONE when doing so. 
*/ -JEMALLOC_EXPORT int JEMALLOC_NOTHROW -get_defrag_hint(void* ptr) { - assert(ptr != NULL); - return iget_defrag_hint(TSDN_NULL, ptr); -} diff --git a/deps/linenoise/CMakeLists.txt b/deps/linenoise/CMakeLists.txt new file mode 100644 index 0000000000..f801e4abf1 --- /dev/null +++ b/deps/linenoise/CMakeLists.txt @@ -0,0 +1,4 @@ +project(linenoise) + +set(SRCS "${CMAKE_CURRENT_LIST_DIR}/linenoise.c" "${CMAKE_CURRENT_LIST_DIR}/linenoise.h") +add_library(linenoise STATIC ${SRCS}) diff --git a/deps/lua/CMakeLists.txt b/deps/lua/CMakeLists.txt new file mode 100644 index 0000000000..0629d7f978 --- /dev/null +++ b/deps/lua/CMakeLists.txt @@ -0,0 +1,53 @@ +project(lualib) + +include(CheckFunctionExists) + +set(LUA_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/src") +set(LUA_SRCS + ${LUA_SRC_DIR}/fpconv.c + ${LUA_SRC_DIR}/lbaselib.c + ${LUA_SRC_DIR}/lmathlib.c + ${LUA_SRC_DIR}/lstring.c + ${LUA_SRC_DIR}/lparser.c + ${LUA_SRC_DIR}/ldo.c + ${LUA_SRC_DIR}/lzio.c + ${LUA_SRC_DIR}/lmem.c + ${LUA_SRC_DIR}/strbuf.c + ${LUA_SRC_DIR}/lstrlib.c + ${LUA_SRC_DIR}/lundump.c + ${LUA_SRC_DIR}/lua_cmsgpack.c + ${LUA_SRC_DIR}/loslib.c + ${LUA_SRC_DIR}/lua_struct.c + ${LUA_SRC_DIR}/ldebug.c + ${LUA_SRC_DIR}/lobject.c + ${LUA_SRC_DIR}/ldump.c + ${LUA_SRC_DIR}/lua_cjson.c + ${LUA_SRC_DIR}/ldblib.c + ${LUA_SRC_DIR}/ltm.c + ${LUA_SRC_DIR}/ltable.c + ${LUA_SRC_DIR}/lstate.c + ${LUA_SRC_DIR}/lua_bit.c + ${LUA_SRC_DIR}/lua.c + ${LUA_SRC_DIR}/loadlib.c + ${LUA_SRC_DIR}/lcode.c + ${LUA_SRC_DIR}/lapi.c + ${LUA_SRC_DIR}/lgc.c + ${LUA_SRC_DIR}/lvm.c + ${LUA_SRC_DIR}/lfunc.c + ${LUA_SRC_DIR}/lauxlib.c + ${LUA_SRC_DIR}/ltablib.c + ${LUA_SRC_DIR}/linit.c + ${LUA_SRC_DIR}/lopcodes.c + ${LUA_SRC_DIR}/llex.c + ${LUA_SRC_DIR}/liolib.c) + +add_library(lualib STATIC "${LUA_SRCS}") +target_include_directories(lualib PUBLIC "${LUA_SRC_DIR}") +target_compile_definitions(lualib PRIVATE ENABLE_CJSON_GLOBAL) + +# Use mkstemp if available +check_function_exists(mkstemp HAVE_MKSTEMP) +if (HAVE_MKSTEMP) + 
target_compile_definitions(lualib PRIVATE LUA_USE_MKSTEMP) +endif () +unset(HAVE_MKSTEMP CACHE) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000000..b87dff3db0 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,87 @@ +project(valkey-server) + +set(INSTALL_BIN_PATH ${CMAKE_INSTALL_PREFIX}/bin) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) + +# Target: valkey-server +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${VALKEY_SERVER_CFLAGS}") +message(STATUS "CFLAGS: ${CMAKE_C_FLAGS}") + +get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS) +list(APPEND SERVER_LIBS "fpconv") +list(APPEND SERVER_LIBS "lualib") +list(APPEND SERVER_LIBS "hdr_histogram") +valkey_build_and_install_bin(valkey-server "${VALKEY_SERVER_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${SERVER_LIBS}" + "redis-server") +add_dependencies(valkey-server generate_commands_def) +add_dependencies(valkey-server generate_fmtargs_h) +add_dependencies(valkey-server release_header) + +if (VALKEY_RELEASE_BUILD) + # Enable LTO for Release build + set_property(TARGET valkey-server PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) +endif () + +if (BUILD_SANITIZER) + # 'BUILD_SANITIZER' is defined in ValkeySetup module (based on user input) + # If defined, the variables 'VALKEY_SANITAIZER_CFLAGS' and 'VALKEY_SANITAIZER_LDFLAGS' + # are set with the link & compile flags required + message(STATUS "Adding sanitizer flags for target valkey-server") + target_compile_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_CFLAGS}) + target_link_options(valkey-server PRIVATE ${VALKEY_SANITAIZER_LDFLAGS}) +endif () +unset(BUILD_SANITIZER CACHE) + +# Target: valkey-cli +list(APPEND CLI_LIBS "linenoise") +valkey_build_and_install_bin(valkey-cli "${VALKEY_CLI_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${CLI_LIBS}" "redis-cli") +add_dependencies(valkey-cli generate_commands_def) +add_dependencies(valkey-cli generate_fmtargs_h) + +# Target: valkey-benchmark +list(APPEND BENCH_LIBS "hdr_histogram") 
+valkey_build_and_install_bin(valkey-benchmark "${VALKEY_BENCHMARK_SRCS}" "${VALKEY_SERVER_LDFLAGS}" "${BENCH_LIBS}" + "redis-benchmark") +add_dependencies(valkey-benchmark generate_commands_def) +add_dependencies(valkey-benchmark generate_fmtargs_h) + +# Targets: valkey-sentinel, valkey-check-aof and valkey-check-rdb are just symbolic links +valkey_create_symlink("valkey-server" "valkey-sentinel") +valkey_create_symlink("valkey-server" "valkey-check-rdb") +valkey_create_symlink("valkey-server" "valkey-check-aof") + +# Target valkey-rdma +if (BUILD_RDMA_MODULE) + set(MODULE_NAME "valkey-rdma") + message(STATUS "Building RDMA module") + add_library(${MODULE_NAME} SHARED "${VALKEY_RDMA_MODULE_SRCS}") + target_compile_options(${MODULE_NAME} PRIVATE -DBUILD_RDMA_MODULE=2 -DUSE_RDMA=1) + target_link_libraries(${MODULE_NAME} "${RDMA_LIBS}") + # remove the "lib" prefix from the module + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) +endif () + +# Target valkey-tls (a module) +if (BUILD_TLS_MODULE) + message(STATUS "Building TLS as a module") + set(MODULE_NAME "valkey-tls") + add_library(${MODULE_NAME} SHARED ${VALKEY_TLS_MODULE_SRCS}) + target_compile_options(${MODULE_NAME} PRIVATE -DUSE_OPENSSL=2 -DBUILD_TLS_MODULE=2) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () + target_link_libraries(${MODULE_NAME} hiredis_ssl OpenSSL::SSL) + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") +endif () + +if (BUILD_EXAMPLE_MODULES) + # Include the modules ("hello*") + message(STATUS "Building example modules") + add_subdirectory(modules) +endif () + +if (BUILD_UNIT_TESTS) + add_subdirectory(unit) +endif () diff --git a/src/Makefile b/src/Makefile index 020b70d6d5..3b4ad0a2ef 100644 --- a/src/Makefile +++ b/src/Makefile @@ -25,7 +25,7 @@ ifeq ($(OPTIMIZATION),-O3) ifeq (clang,$(CLANG)) 
OPTIMIZATION+=-flto else - OPTIMIZATION+=-flto=auto + OPTIMIZATION+=-flto=auto -ffat-lto-objects endif endif ifneq ($(OPTIMIZATION),-O0) @@ -98,15 +98,6 @@ ifeq ($(USE_JEMALLOC),no) MALLOC=libc endif -# Some unit tests compile files a second time to get access to static functions, the "--allow-multiple-definition" flag -# allows us to do that without an error, by using the first instance of function. This behavior can also be used -# to tweak behavior of code just for unit tests. The version of ld on MacOS apparently always does this. -ifneq ($(uname_S),Darwin) - ALLOW_DUPLICATE_FLAG=-Wl,--allow-multiple-definition -else - ALLOW_DUPLICATE_FLAG= -endif - ifdef SANITIZER ifeq ($(SANITIZER),address) MALLOC=libc @@ -140,9 +131,6 @@ ifdef REDIS_LDFLAGS endif FINAL_CFLAGS=$(STD) $(WARN) $(OPT) $(DEBUG) $(CFLAGS) $(SERVER_CFLAGS) -ifeq ($(SERVER_TEST),yes) - FINAL_CFLAGS +=-DSERVER_TEST=1 -endif FINAL_LDFLAGS=$(LDFLAGS) $(OPT) $(SERVER_LDFLAGS) $(DEBUG) FINAL_LIBS=-lm DEBUG=-g -ggdb @@ -337,26 +325,26 @@ ifeq ($(BUILD_TLS),module) TLS_MODULE_CFLAGS+=-DUSE_OPENSSL=$(BUILD_MODULE) $(OPENSSL_CFLAGS) -DBUILD_TLS_MODULE=$(BUILD_MODULE) endif -BUILD_RDMA:=no -RDMA_MODULE= -RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so -RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) -ifeq ($(BUILD_RDMA),module) - FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) - RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) +RDMA_LIBS= +RDMA_PKGCONFIG := $(shell $(PKG_CONFIG) --exists librdmacm libibverbs && echo $$?) 
ifeq ($(RDMA_PKGCONFIG),0) RDMA_LIBS=$(shell $(PKG_CONFIG) --libs librdmacm libibverbs) else RDMA_LIBS=-lrdmacm -libverbs endif - RDMA_MODULE=$(RDMA_MODULE_NAME) - RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE $(RDMA_LIBS) -else -ifeq ($(BUILD_RDMA),no) - # disable RDMA, do nothing -else - $(error "RDMA is only supported as module (BUILD_RDMA=module), or disabled (BUILD_RDMA=no)") + +ifeq ($(BUILD_RDMA),yes) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_YES) -DBUILD_RDMA_MODULE=$(BUILD_NO) + FINAL_LIBS += $(RDMA_LIBS) endif + +RDMA_MODULE= +RDMA_MODULE_NAME:=valkey-rdma$(PROG_SUFFIX).so +RDMA_MODULE_CFLAGS:=$(FINAL_CFLAGS) +ifeq ($(BUILD_RDMA),module) + FINAL_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) + RDMA_MODULE=$(RDMA_MODULE_NAME) + RDMA_MODULE_CFLAGS+=-DUSE_RDMA=$(BUILD_MODULE) -DBUILD_RDMA_MODULE=$(BUILD_MODULE) $(RDMA_LIBS) endif ifndef V @@ -423,7 +411,7 @@ endif ENGINE_NAME=valkey SERVER_NAME=$(ENGINE_NAME)-server$(PROG_SUFFIX) ENGINE_SENTINEL_NAME=$(ENGINE_NAME)-sentinel$(PROG_SUFFIX) -ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o 
commands.o strl.o connection.o unix.o logreqres.o +ENGINE_SERVER_OBJ=threads_mngr.o adlist.o quicklist.o ae.o anet.o dict.o kvstore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o memory_prefetch.o io_threads.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o valkey-check-rdb.o valkey-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o allocator_defrag.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o rdma.o ENGINE_CLI_NAME=$(ENGINE_NAME)-cli$(PROG_SUFFIX) ENGINE_CLI_OBJ=anet.o adlist.o dict.o valkey-cli.o zmalloc.o release.o ae.o serverassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o ENGINE_BENCHMARK_NAME=$(ENGINE_NAME)-benchmark$(PROG_SUFFIX) @@ -436,6 +424,17 @@ ENGINE_TEST_OBJ:=$(sort $(patsubst unit/%.c,unit/%.o,$(ENGINE_TEST_FILES))) ENGINE_UNIT_TESTS:=$(ENGINE_NAME)-unit-tests$(PROG_SUFFIX) ALL_SOURCES=$(sort $(patsubst %.o,%.c,$(ENGINE_SERVER_OBJ) $(ENGINE_CLI_OBJ) $(ENGINE_BENCHMARK_OBJ))) +USE_FAST_FLOAT?=no +ifeq ($(USE_FAST_FLOAT),yes) + # valkey_strtod.h uses this flag to switch valkey_strtod function to fast_float_strtod, + # therefore let's pass it to compiler for preprocessing. 
+ FINAL_CFLAGS += -D USE_FAST_FLOAT + # next, let's build and add actual library containing fast_float_strtod function for linking. + DEPENDENCY_TARGETS += fast_float_c_interface + FAST_FLOAT_STRTOD_OBJECT := ../deps/fast_float_c_interface/fast_float_strtod.o + FINAL_LIBS += $(FAST_FLOAT_STRTOD_OBJECT) +endif + all: $(SERVER_NAME) $(ENGINE_SENTINEL_NAME) $(ENGINE_CLI_NAME) $(ENGINE_BENCHMARK_NAME) $(ENGINE_CHECK_RDB_NAME) $(ENGINE_CHECK_AOF_NAME) $(TLS_MODULE) $(RDMA_MODULE) @echo "" @echo "Hint: It's a good idea to run 'make test' ;)" @@ -494,7 +493,7 @@ $(ENGINE_LIB_NAME): $(ENGINE_SERVER_OBJ) # valkey-unit-tests $(ENGINE_UNIT_TESTS): $(ENGINE_TEST_OBJ) $(ENGINE_LIB_NAME) - $(SERVER_LD) $(ALLOW_DUPLICATE_FLAG) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS) + $(SERVER_LD) -o $@ $^ ../deps/hiredis/libhiredis.a ../deps/lua/src/liblua.a ../deps/hdr_histogram/libhdrhistogram.a ../deps/fpconv/libfpconv.a $(FINAL_LIBS) # valkey-sentinel $(ENGINE_SENTINEL_NAME): $(SERVER_NAME) @@ -600,7 +599,7 @@ bench: $(ENGINE_BENCHMARK_NAME) 32bit: @echo "" - @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386" + @echo "WARNING: if it fails under Linux you probably need to install libc6-dev-i386 and libstdc++-11-dev-i386-cross" @echo "" $(MAKE) all-with-unit-tests CFLAGS="-m32" LDFLAGS="-m32" diff --git a/src/ae.c b/src/ae.c index 9bf8619902..643ff17070 100644 --- a/src/ae.c +++ b/src/ae.c @@ -85,7 +85,7 @@ aeEventLoop *aeCreateEventLoop(int setsize) { if (eventLoop->events == NULL || eventLoop->fired == NULL) goto err; eventLoop->setsize = setsize; eventLoop->timeEventHead = NULL; - eventLoop->timeEventNextId = 0; + eventLoop->timeEventNextId = 1; eventLoop->stop = 0; eventLoop->maxfd = -1; eventLoop->beforesleep = NULL; diff --git a/src/allocator_defrag.c b/src/allocator_defrag.c new file mode 100644 index 0000000000..b2330c95e0 --- /dev/null +++ 
b/src/allocator_defrag.c @@ -0,0 +1,426 @@ +/* Copyright 2024- Valkey contributors + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + */ + +/* + * This file implements allocator-specific defragmentation logic used + * within the Valkey engine. Below is the relationship between various + * components involved in allocation and defragmentation: + * + * Application code + * / \ + * allocation / \ defrag + * / \ + * zmalloc allocator_defrag + * / | \ / \ + * / | \ / \ + * / | \ / \ + * libc tcmalloc jemalloc other + * + * Explanation: + * - **Application code**: High-level application logic that uses memory + * allocation and may trigger defragmentation. + * - **zmalloc**: An abstraction layer over the memory allocator, providing + * a uniform allocation interface to the application code. It can delegate + * to various underlying allocators (e.g., libc, tcmalloc, jemalloc, or others). + * It is not dependent on defrag implementation logic and it's possible to use a jemalloc + * version that does not support defrag. + * - **allocator_defrag**: This file contains allocator-specific logic for + * defragmentation, invoked from `defrag.c` when memory defragmentation is needed. + * Currently, jemalloc is the only allocator with implemented defrag logic. It is possible that + * future implementations will include non-allocator defragmentation (think of data-structure + * compaction for example). + * - **Underlying allocators**: These are the actual memory allocators, such as + * libc, tcmalloc, jemalloc, or other custom allocators. The defragmentation + * logic in `allocator_defrag` interacts with these allocators to reorganize + * memory and reduce fragmentation. + * + * The `defrag.c` file acts as the central entry point for defragmentation, + * invoking allocator-specific implementations provided here in `allocator_defrag.c`.
+ * + * Note: Developers working on `zmalloc` or `allocator_defrag` should refer to + * the other component to ensure both are using the same allocator configuration. + */ + +#include <stdio.h> +#include "serverassert.h" +#include "allocator_defrag.h" + +#define UNUSED(x) (void)(x) + +#if defined(HAVE_DEFRAG) && defined(USE_JEMALLOC) + +#define STRINGIFY_(x) #x +#define STRINGIFY(x) STRINGIFY_(x) + +#define BATCH_QUERY_ARGS_OUT 3 +#define SLAB_NFREE(out, i) out[(i) * BATCH_QUERY_ARGS_OUT] +#define SLAB_LEN(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 2] +#define SLAB_NUM_REGS(out, i) out[(i) * BATCH_QUERY_ARGS_OUT + 1] + +#define UTILIZATION_THRESHOLD_FACTOR_MILI (125) // 12.5% additional utilization + +/* + * Represents a precomputed key for querying jemalloc statistics. + * + * The `jeMallctlKey` structure stores a key corresponding to a specific jemalloc + * statistics field name. This key is used with the `je_mallctlbymib` interface + * to query statistics more efficiently, bypassing the need for runtime string + * lookup and translation performed by `je_mallctl`. + * + * - `je_mallctlnametomib` is called once for each statistics field to precompute + * and store the key corresponding to the field name. + * - Subsequent queries use `je_mallctlbymib` with the stored key, avoiding the + * overhead of repeated string-based lookups. + * + */ +typedef struct jeMallctlKey { + size_t key[6]; /* The precomputed key used to query jemalloc statistics. */ + size_t keylen; /* The length of the key array. */ +} jeMallctlKey; + +/* Stores MIB (Management Information Base) keys for jemalloc bin queries. + * + * This struct holds precomputed `jeMallctlKey` values for querying various + * jemalloc bin-related statistics efficiently. + */ +typedef struct jeBinInfoKeys { + jeMallctlKey curr_slabs; /* Key to query the current number of slabs in the bin. */ + jeMallctlKey nonfull_slabs; /* Key to query the number of non-full slabs in the bin.
*/ + jeMallctlKey curr_regs; /* Key to query the current number of regions in the bin. */ +} jeBinInfoKeys; + +/* Represents detailed information about a jemalloc bin. + * + * This struct provides metadata about a jemalloc bin, including the size of + * its regions, total number of regions, and related MIB keys for efficient + * queries. + */ +typedef struct jeBinInfo { + size_t reg_size; /* Size of each region in the bin. */ + uint32_t nregs; /* Total number of regions in the bin. */ + jeBinInfoKeys info_keys; /* Precomputed MIB keys for querying bin statistics. */ +} jeBinInfo; + +/* Represents the configuration for jemalloc bins. + * + * This struct contains information about the number of bins and metadata for + * each bin, as well as precomputed keys for batch utility queries and epoch updates. + */ +typedef struct jemallocCB { + unsigned nbins; /* Number of bins in the jemalloc configuration. */ + jeBinInfo *bin_info; /* Array of `jeBinInfo` structs, one for each bin. */ + jeMallctlKey util_batch_query; /* Key to query batch utilization information. */ + jeMallctlKey epoch; /* Key to trigger statistics sync between threads. */ +} jemallocCB; + +/* Represents the latest usage statistics for a jemalloc bin. + * + * This struct tracks the current usage of a bin, including the number of slabs + * and regions, and calculates the number of full slabs from other fields. + */ +typedef struct jemallocBinUsageData { + size_t curr_slabs; /* Current number of slabs in the bin. */ + size_t curr_nonfull_slabs; /* Current number of non-full slabs in the bin. */ + size_t curr_regs; /* Current number of regions in the bin. */ +} jemallocBinUsageData; + + +static int defrag_supported = 0; +/* Control block holding information about bins and query helper - + * this structure is initialized once when calling allocatorDefragInit. It does not change afterwards*/ +static jemallocCB je_cb = {0, NULL, {{0}, 0}, {{0}, 0}}; +/* Holds the latest usage statistics for each bin. 
This structure is updated when calling + * allocatorDefragGetFragSmallbins and later is used to make a defrag decision for a memory pointer. */ +static jemallocBinUsageData *je_usage_info = NULL; + + +/* ----------------------------------------------------------------------------- + * Alloc/Free API that are cooperative with defrag + * -------------------------------------------------------------------------- */ + +/* Allocation and free functions that bypass the thread cache + * and go straight to the allocator arena bins. + * Currently implemented only for jemalloc. Used for online defragmentation. + */ +void *allocatorDefragAlloc(size_t size) { + void *ptr = je_mallocx(size, MALLOCX_TCACHE_NONE); + return ptr; +} +void allocatorDefragFree(void *ptr, size_t size) { + if (ptr == NULL) return; + je_sdallocx(ptr, size, MALLOCX_TCACHE_NONE); +} + +/* ----------------------------------------------------------------------------- + * Helper functions for jemalloc translation between size and index + * -------------------------------------------------------------------------- */ + +/* Get the bin index in bin array from the reg_size. + * + * these are reverse engineered mapping of reg_size -> binind. 
We need this information because the utilization query + * returns the size of the buffer and not the bin index, and we need the bin index to access it's usage information + * + * Note: In case future PR will return the binind (that is better API anyway) we can get rid of + * these conversion functions + */ +static inline unsigned jeSize2BinIndexLgQ3(size_t sz) { + /* Number of bins in each power-of-2 size class group */ + const size_t size_class_group_size = 4; + /* log2 of the smallest size class, i.e. the 8-byte quantum (1 << 3) */ + const size_t lg_quantum_3_first_pow2 = 3; + /* Index of the last linearly spaced bin (64 bytes); exponential bins are numbered after it */ + const size_t lg_quantum_3_offset = ((64 >> lg_quantum_3_first_pow2) - 1); + /* Small sizes (8-64 bytes) use linear binning */ + if (sz <= 64) { // 64 = 1 << (lg_quantum_3_first_pow2 + 3) + return (sz >> 3) - 1; // Divide by 8 and subtract 1 + } + + /* For larger sizes, use exponential binning */ + + /* Calculate leading zeros of (sz - 1) to properly handle power-of-2 sizes */ + unsigned leading_zeros = __builtin_clzll(sz - 1); + unsigned exp = 64 - leading_zeros; // Effective log2(sz) + + /* Calculate the size's position within its group */ + unsigned within_group_offset = size_class_group_size - + (((1ULL << exp) - sz) >> (exp - lg_quantum_3_first_pow2)); + + /* Calculate the final bin index */ + return within_group_offset + + ((exp - (lg_quantum_3_first_pow2 + 3)) - 1) * size_class_group_size + + lg_quantum_3_offset; +} +/* ----------------------------------------------------------------------------- + * Interface functions to get fragmentation info from jemalloc + * -------------------------------------------------------------------------- */ +#define ARENA_TO_QUERY MALLCTL_ARENAS_ALL + +static inline void jeRefreshStats(const jemallocCB *je_cb) { + uint64_t epoch = 1; // Value doesn't matter + size_t sz = sizeof(epoch); + /* Refresh stats */ + je_mallctlbymib(je_cb->epoch.key, je_cb->epoch.keylen, &epoch, &sz, &epoch, sz); +} + +/* Extract key that corresponds to the given
name for fast query. This should be called once for each key_name */ +static inline int jeQueryKeyInit(const char *key_name, jeMallctlKey *key_info) { + key_info->keylen = sizeof(key_info->key) / sizeof(key_info->key[0]); + int res = je_mallctlnametomib(key_name, key_info->key, &key_info->keylen); + /* sanity check that returned value is not larger than provided */ + assert(key_info->keylen <= sizeof(key_info->key) / sizeof(key_info->key[0])); + return res; +} + +/* Query jemalloc control interface using previously extracted key (with jeQueryKeyInit) instead of name string. + * This interface (named MIB in jemalloc) is faster as it avoids string dict lookup at run-time. */ +static inline int jeQueryCtlInterface(const jeMallctlKey *key_info, void *value) { + size_t sz = sizeof(size_t); + return je_mallctlbymib(key_info->key, key_info->keylen, value, &sz, NULL, 0); +} + +static inline int binQueryHelperInitialization(jeBinInfoKeys *helper, unsigned bin_index) { + char mallctl_name[128]; + + /* Mib of fetch number of used regions in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curregs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_regs) != 0) return -1; + /* Mib of fetch number of current slabs in the bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.curslabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->curr_slabs) != 0) return -1; + /* Mib of fetch nonfull slabs */ + snprintf(mallctl_name, sizeof(mallctl_name), "stats.arenas." STRINGIFY(ARENA_TO_QUERY) ".bins.%d.nonfull_slabs", bin_index); + if (jeQueryKeyInit(mallctl_name, &helper->nonfull_slabs) != 0) return -1; + + return 0; +} + +/* Initializes the defragmentation system for the jemalloc memory allocator. + * + * This function performs the necessary setup and initialization steps for the defragmentation system. 
+ * It retrieves the configuration information for the jemalloc arenas and bins, and initializes the usage + * statistics data structure. + * + * return 0 on success, or a non-zero error code on failure. + * + * The initialization process involves the following steps: + * 1. Check if defragmentation is supported by the current jemalloc version. + * 2. Retrieve the arena bin configuration information using the `je_mallctlbymib` function. + * 3. Initialize the `usage_latest` structure with the bin usage statistics and configuration data. + * 4. Set the `defrag_supported` flag to indicate that defragmentation is enabled. + * + * Note: This function must be called before using any other defragmentation-related functionality. + * It should be called during the initialization phase of the code that uses the + * defragmentation feature. + */ +int allocatorDefragInit(void) { + char mallctl_name[100]; + jeBinInfo *bin_info; + size_t sz; + int je_res; + + /* the init should be called only once, fail if unexpected call */ + assert(!defrag_supported); + + /* Get the mib of the per memory pointers query command that is used during defrag scan over memory */ + if (jeQueryKeyInit("experimental.utilization.batch_query", &je_cb.util_batch_query) != 0) return -1; + + je_res = jeQueryKeyInit("epoch", &je_cb.epoch); + assert(je_res == 0); + jeRefreshStats(&je_cb); + + /* get quantum for verification only, current code assumes lg-quantum should be 3 */ + size_t jemalloc_quantum; + sz = sizeof(jemalloc_quantum); + je_mallctl("arenas.quantum", &jemalloc_quantum, &sz, NULL, 0); + /* lg-quantum should be 3 so jemalloc_quantum should be 1<<3 */ + assert(jemalloc_quantum == 8); + + sz = sizeof(je_cb.nbins); + je_res = je_mallctl("arenas.nbins", &je_cb.nbins, &sz, NULL, 0); + assert(je_res == 0 && je_cb.nbins != 0); + + je_cb.bin_info = je_calloc(je_cb.nbins, sizeof(jeBinInfo)); + assert(je_cb.bin_info != NULL); + je_usage_info = je_calloc(je_cb.nbins, sizeof(jemallocBinUsageData)); + 
assert(je_usage_info != NULL); + + for (unsigned j = 0; j < je_cb.nbins; j++) { + bin_info = &je_cb.bin_info[j]; + /* The size of the current bin */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.size", j); + sz = sizeof(bin_info->reg_size); + je_res = je_mallctl(mallctl_name, &bin_info->reg_size, &sz, NULL, 0); + assert(je_res == 0); + /* Number of regions per slab */ + snprintf(mallctl_name, sizeof(mallctl_name), "arenas.bin.%d.nregs", j); + sz = sizeof(bin_info->nregs); + je_res = je_mallctl(mallctl_name, &bin_info->nregs, &sz, NULL, 0); + assert(je_res == 0); + + /* init bin specific fast query keys */ + je_res = binQueryHelperInitialization(&bin_info->info_keys, j); + assert(je_res == 0); + + /* verify the reverse map of reg_size to bin index */ + assert(jeSize2BinIndexLgQ3(bin_info->reg_size) == j); + } + + /* defrag is supported mark it to enable defrag queries */ + defrag_supported = 1; + return 0; +} + +/* Total size of consumed memory in unused regs in small bins (AKA external fragmentation). + * The function will refresh the epoch. + * + * return total fragmentation bytes + */ +unsigned long allocatorDefragGetFragSmallbins(void) { + assert(defrag_supported); + unsigned long frag = 0; + jeRefreshStats(&je_cb); + for (unsigned j = 0; j < je_cb.nbins; j++) { + jeBinInfo *bin_info = &je_cb.bin_info[j]; + jemallocBinUsageData *bin_usage = &je_usage_info[j]; + + /* Number of currently used regions in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_regs, &bin_usage->curr_regs); + /* Number of current slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.curr_slabs, &bin_usage->curr_slabs); + /* Number of non full slabs in the bin */ + jeQueryCtlInterface(&bin_info->info_keys.nonfull_slabs, &bin_usage->curr_nonfull_slabs); + + /* Calculate the fragmentation bytes for the current bin and add it to the total.
*/ + frag += ((bin_info->nregs * bin_usage->curr_slabs) - bin_usage->curr_regs) * bin_info->reg_size; + } + return frag; +} + +/* Determines whether defragmentation should be performed on a pointer based on jemalloc information. + * + * bin_info Pointer to the bin information structure. + * bin_usage Pointer to the bin usage structure. + * nalloced Number of allocated regions in the slab that holds the pointer. + * + * return 1 if defragmentation should be performed, 0 otherwise. + * + * This function checks the following conditions to determine if defragmentation should be performed: + * 1. If the number of allocated regions (nalloced) is equal to the total number of regions (bin_info->nregs), + * defragmentation is not necessary as moving regions is guaranteed not to change the fragmentation ratio. + * 2. If the number of non-full slabs (bin_usage->curr_nonfull_slabs) is less than 2, defragmentation is not performed + * because there is no other slab to move regions to. + * 3. If slab utilization < 'avg utilization'*1.125 [code 1.125 == (1000+UTILIZATION_THRESHOLD_FACTOR_MILI)/1000] + * then we should defrag; the average is taken over the non-full slabs only. This is aligned with previous je_defrag_hint implementation. + */ +static inline int makeDefragDecision(jeBinInfo *bin_info, jemallocBinUsageData *bin_usage, unsigned long nalloced) { + unsigned long curr_full_slabs = bin_usage->curr_slabs - bin_usage->curr_nonfull_slabs; + size_t allocated_nonfull = bin_usage->curr_regs - curr_full_slabs * bin_info->nregs; + if (bin_info->nregs == nalloced || bin_usage->curr_nonfull_slabs < 2 || + 1000 * nalloced * bin_usage->curr_nonfull_slabs > (1000 + UTILIZATION_THRESHOLD_FACTOR_MILI) * allocated_nonfull) { + return 0; + } + return 1; +} + +/* + * Performs defragmentation analysis for a given ptr. + * + * ptr - ptr to memory region to be analyzed. + * + * return - the function returns 1 if defrag should be performed, 0 otherwise.
+ */ +int allocatorShouldDefrag(void *ptr) { + assert(defrag_supported); + size_t out[BATCH_QUERY_ARGS_OUT]; + size_t out_sz = sizeof(out); + size_t in_sz = sizeof(ptr); + for (unsigned j = 0; j < BATCH_QUERY_ARGS_OUT; j++) { + out[j] = -1; + } + je_mallctlbymib(je_cb.util_batch_query.key, + je_cb.util_batch_query.keylen, + out, &out_sz, + &ptr, in_sz); + /* handle results with appropriate quantum value */ + assert(SLAB_NUM_REGS(out, 0) > 0); + assert(SLAB_LEN(out, 0) > 0); + assert(SLAB_NFREE(out, 0) != (size_t)-1); + unsigned region_size = SLAB_LEN(out, 0) / SLAB_NUM_REGS(out, 0); + /* check that the allocation size is in range of small bins */ + if (region_size > je_cb.bin_info[je_cb.nbins - 1].reg_size) { + return 0; + } + /* get the index based on quantum used */ + unsigned binind = jeSize2BinIndexLgQ3(region_size); + /* make sure binind is in range and reverse map is correct */ + assert(binind < je_cb.nbins && region_size == je_cb.bin_info[binind].reg_size); + + return makeDefragDecision(&je_cb.bin_info[binind], + &je_usage_info[binind], + je_cb.bin_info[binind].nregs - SLAB_NFREE(out, 0)); +} + +#else + +int allocatorDefragInit(void) { + return -1; +} +void allocatorDefragFree(void *ptr, size_t size) { + UNUSED(ptr); + UNUSED(size); +} +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size) { + UNUSED(size); + return NULL; +} +unsigned long allocatorDefragGetFragSmallbins(void) { + return 0; +} + +int allocatorShouldDefrag(void *ptr) { + UNUSED(ptr); + return 0; +} +#endif diff --git a/src/allocator_defrag.h b/src/allocator_defrag.h new file mode 100644 index 0000000000..7fb56208b6 --- /dev/null +++ b/src/allocator_defrag.h @@ -0,0 +1,22 @@ +#ifndef __ALLOCATOR_DEFRAG_H +#define __ALLOCATOR_DEFRAG_H + +#if defined(USE_JEMALLOC) +#include <jemalloc/jemalloc.h> +/* We can enable the server defrag capabilities only if we are using Jemalloc + * and the version that has the experimental.utilization namespace in mallctl .
*/ +#if defined(JEMALLOC_VERSION_MAJOR) && \ + (JEMALLOC_VERSION_MAJOR > 5 || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR > 2) || \ + (JEMALLOC_VERSION_MAJOR == 5 && JEMALLOC_VERSION_MINOR == 2 && JEMALLOC_VERSION_BUGFIX >= 1)) +#define HAVE_DEFRAG +#endif +#endif + +int allocatorDefragInit(void); +void allocatorDefragFree(void *ptr, size_t size); +__attribute__((malloc)) void *allocatorDefragAlloc(size_t size); +unsigned long allocatorDefragGetFragSmallbins(void); +int allocatorShouldDefrag(void *ptr); + +#endif /* __ALLOCATOR_DEFRAG_H */ diff --git a/src/anet.c b/src/anet.c index d4ac698982..8dc06ca62e 100644 --- a/src/anet.c +++ b/src/anet.c @@ -70,17 +70,24 @@ int anetGetError(int fd) { return sockerr; } -int anetSetBlock(char *err, int fd, int non_block) { +static int anetGetSocketFlags(char *err, int fd) { int flags; - /* Set the socket blocking (if non_block is zero) or non-blocking. - * Note that fcntl(2) for F_GETFL and F_SETFL can't be - * interrupted by a signal. */ if ((flags = fcntl(fd, F_GETFL)) == -1) { anetSetError(err, "fcntl(F_GETFL): %s", strerror(errno)); return ANET_ERR; } + return flags; +} + +int anetSetBlock(char *err, int fd, int non_block) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + /* Check if this flag has been set or unset, if so, * then there is no need to call fcntl to set/unset it again. */ if (!!(flags & O_NONBLOCK) == !!non_block) return ANET_OK; @@ -105,6 +112,21 @@ int anetBlock(char *err, int fd) { return anetSetBlock(err, fd, 0); } +int anetIsBlock(char *err, int fd) { + int flags = anetGetSocketFlags(err, fd); + + if (flags == ANET_ERR) { + return ANET_ERR; + } + + /* Check if the O_NONBLOCK flag is set */ + if (flags & O_NONBLOCK) { + return 0; /* Socket is non-blocking */ + } else { + return 1; /* Socket is blocking */ + } +} + /* Enable the FD_CLOEXEC on the given fd to avoid fd leaks. 
* This function should be invoked for fd's on specific places * where fork + execve system calls are called. */ diff --git a/src/anet.h b/src/anet.h index ab32f72e4b..b14b4bdaad 100644 --- a/src/anet.h +++ b/src/anet.h @@ -61,6 +61,7 @@ int anetTcpAccept(char *err, int serversock, char *ip, size_t ip_len, int *port) int anetUnixAccept(char *err, int serversock); int anetNonBlock(char *err, int fd); int anetBlock(char *err, int fd); +int anetIsBlock(char *err, int fd); int anetCloexec(int fd); int anetEnableTcpNoDelay(char *err, int fd); int anetDisableTcpNoDelay(char *err, int fd); diff --git a/src/blocked.c b/src/blocked.c index 8e1974a703..aeec560b3f 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -206,7 +206,6 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Reset the client for a new query, unless the client has pending command to process * or in case a shutdown operation was canceled and we are still in the processCommand sequence */ if (!c->flag.pending_command && c->bstate.btype != BLOCKED_SHUTDOWN) { - freeClientOriginalArgv(c); /* Clients that are not blocked on keys are not reprocessed so we must * call reqresAppendResponse here (for clients blocked on key, * unblockClientOnKey is called, which eventually calls processCommand, diff --git a/src/cluster.h b/src/cluster.h index 2e4f33a3c9..142f2d70b3 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -12,6 +12,12 @@ #define CLUSTER_FAIL 1 /* The cluster can't work */ #define CLUSTER_NAMELEN 40 /* sha1 hex length */ +/* Reason why the cluster state changes to fail. When adding new reasons, + * make sure to update clusterLogFailReason. */ +#define CLUSTER_FAIL_NONE 0 +#define CLUSTER_FAIL_NOT_FULL_COVERAGE 1 +#define CLUSTER_FAIL_MINORITY_PARTITION 2 + /* Redirection errors returned by getNodeByQuery(). */ #define CLUSTER_REDIR_NONE 0 /* Node can serve the request. */ #define CLUSTER_REDIR_CROSS_SLOT 1 /* -CROSSSLOT request. 
*/ @@ -96,7 +102,7 @@ int clusterNodeIsFailing(clusterNode *node); int clusterNodeIsNoFailover(clusterNode *node); char *clusterNodeGetShardId(clusterNode *node); int clusterNodeNumReplicas(clusterNode *node); -clusterNode *clusterNodeGetReplica(clusterNode *node, int slave_idx); +clusterNode *clusterNodeGetReplica(clusterNode *node, int replica_idx); clusterNode *getMigratingSlotDest(int slot); clusterNode *getImportingSlotSource(int slot); clusterNode *getNodeBySlot(int slot); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 43d56b9a09..50a8ffca38 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1082,6 +1082,7 @@ void clusterInit(void) { server.cluster->myself = NULL; server.cluster->currentEpoch = 0; server.cluster->state = CLUSTER_FAIL; + server.cluster->fail_reason = CLUSTER_FAIL_NONE; server.cluster->size = 0; server.cluster->todo_before_sleep = 0; server.cluster->nodes = dictCreate(&clusterNodesDictType); @@ -2451,6 +2452,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * need to delete all the keys in the slots we lost ownership. */ uint16_t dirty_slots[CLUSTER_SLOTS]; int dirty_slots_count = 0; + int delete_dirty_slots = 0; /* We should detect if sender is new primary of our shard. * We will know it if all our slots were migrated to sender, and sender @@ -2668,7 +2670,8 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * If the sender and myself are in the same shard, try psync. 
*/ clusterSetPrimary(sender, !are_in_same_shard, !are_in_same_shard); - clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG | + CLUSTER_TODO_BROADCAST_ALL); } else if (nodeIsPrimary(myself) && (sender_slots >= migrated_our_slots) && !are_in_same_shard) { /* When all our slots are lost to the sender and the sender belongs to * a different shard, this is likely due to a client triggered slot @@ -2677,6 +2680,12 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc serverLog(LL_NOTICE, "My last slot was migrated to node %.40s (%s) in shard %.40s. I am now an empty primary.", sender->name, sender->human_nodename, sender->shard_id); + /* We may still have dirty slots when we became an empty primary due to + * a bad migration. + * + * In order to maintain a consistent state between keys and slots + * we need to remove all the keys from the slots we lost. */ + delete_dirty_slots = 1; } } else if (dirty_slots_count) { /* If we are here, we received an update message which removed @@ -2686,6 +2695,10 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * In order to maintain a consistent state between keys and slots * we need to remove all the keys from the slots we lost.
*/ + delete_dirty_slots = 1; + } + + if (delete_dirty_slots) { for (int j = 0; j < dirty_slots_count; j++) { serverLog(LL_NOTICE, "Deleting keys in dirty slot %d on node %.40s (%s) in shard %.40s", dirty_slots[j], myself->name, myself->human_nodename, myself->shard_id); @@ -2981,7 +2994,7 @@ int clusterIsValidPacket(clusterLink *link) { return 0; } - if (type == server.cluster_drop_packet_filter) { + if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) { serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); return 0; } @@ -3070,7 +3083,8 @@ int clusterProcessPacket(clusterLink *link) { if (!clusterIsValidPacket(link)) { clusterMsg *hdr = (clusterMsg *)link->rcvbuf; uint16_t type = ntohs(hdr->type); - if (server.debug_cluster_close_link_on_packet_drop && type == server.cluster_drop_packet_filter) { + if (server.debug_cluster_close_link_on_packet_drop && + (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) { freeClusterLink(link); serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); return 0; @@ -3134,6 +3148,25 @@ int clusterProcessPacket(clusterLink *link) { if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); + + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + sender->configEpoch >= server.cluster->failover_auth_epoch) { + /* Another node has claimed an epoch greater than or equal to ours. + * If we have an ongoing election, reset it because we cannot win + * with an epoch smaller than or equal to the incoming claim. This + * allows us to start a new election as soon as possible. 
*/ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a claim from " + "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election " + "since we cannot win an election in the past.", + (unsigned long long)server.cluster->failover_auth_epoch, + sender->name, sender->human_nodename, + (unsigned long long)sender->configEpoch); + /* Maybe we could start a new election, set a flag here to make sure + * we check as soon as possible, instead of waiting for a cron. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } } /* Update the replication offset info for this node. */ sender->repl_offset = ntohu64(hdr->offset); @@ -4333,12 +4366,17 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We did not voted for a replica about this primary for two * times the node timeout. This is not strictly needed for correctness - * of the algorithm but makes the base case more linear. */ - if (mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { + * of the algorithm but makes the base case more linear. + * + * This limitation does not restrict manual failover. If a user initiates + * a manual failover, we need to allow it to vote, otherwise the manual + * failover may time out. */ + if (!force_ack && mstime() - node->replicaof->voted_time < server.cluster_node_timeout * 2) { serverLog(LL_WARNING, - "Failover auth denied to %.40s %s: " - "can't vote about this primary before %lld milliseconds", + "Failover auth denied to %.40s (%s): " + "can't vote for any replica of %.40s (%s) within %lld milliseconds", node->name, node->human_nodename, + node->replicaof->name, node->replicaof->human_nodename, (long long)((server.cluster_node_timeout * 2) - (mstime() - node->replicaof->voted_time))); return; } @@ -4364,7 +4402,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { /* We can vote for this replica. 
*/ server.cluster->lastVoteEpoch = server.cluster->currentEpoch; - node->replicaof->voted_time = mstime(); + if (!force_ack) node->replicaof->voted_time = mstime(); clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); clusterSendFailoverAuth(node); serverLog(LL_NOTICE, "Failover auth granted to %.40s (%s) for epoch %llu", node->name, node->human_nodename, @@ -4457,7 +4495,7 @@ void clusterLogCantFailover(int reason) { case CLUSTER_CANT_FAILOVER_WAITING_DELAY: msg = "Waiting the delay before I can start a new failover."; break; case CLUSTER_CANT_FAILOVER_EXPIRED: msg = "Failover attempt expired."; break; case CLUSTER_CANT_FAILOVER_WAITING_VOTES: msg = "Waiting for votes, but majority still not reached."; break; - default: msg = "Unknown reason code."; break; + default: serverPanic("Unknown cant failover reason code."); } lastlog_time = time(NULL); serverLog(LL_NOTICE, "Currently unable to failover: %s", msg); @@ -4503,7 +4541,7 @@ void clusterFailoverReplaceYourPrimary(void) { /* 4) Pong all the other nodes so that they can update the state * accordingly and detect that we switched to primary role. */ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); /* 5) If there was a manual failover in progress, clear the state. */ resetManualFailover(); @@ -4518,8 +4556,9 @@ void clusterFailoverReplaceYourPrimary(void) { * 3) Perform the failover informing all the other nodes. 
*/ void clusterHandleReplicaFailover(void) { + mstime_t now = mstime(); mstime_t data_age; - mstime_t auth_age = mstime() - server.cluster->failover_auth_time; + mstime_t auth_age = now - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int manual_failover = server.cluster->mf_end != 0 && server.cluster->mf_can_start; mstime_t auth_timeout, auth_retry_time; @@ -4581,7 +4620,7 @@ void clusterHandleReplicaFailover(void) { /* If the previous failover attempt timeout and the retry time has * elapsed, we can setup a new one. */ if (auth_age > auth_retry_time) { - server.cluster->failover_auth_time = mstime() + + server.cluster->failover_auth_time = now + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; @@ -4593,20 +4632,26 @@ void clusterHandleReplicaFailover(void) { server.cluster->failover_auth_time += server.cluster->failover_auth_rank * 1000; /* However if this is a manual failover, no delay is needed. */ if (server.cluster->mf_end) { - server.cluster->failover_auth_time = mstime(); + server.cluster->failover_auth_time = now; server.cluster->failover_auth_rank = 0; - clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + /* Reset auth_age since it is outdated now and we can bypass the auth_timeout + * check in the next state and start the election ASAP. */ + auth_age = 0; } serverLog(LL_NOTICE, "Start of election delayed for %lld milliseconds " "(rank #%d, offset %lld).", - server.cluster->failover_auth_time - mstime(), server.cluster->failover_auth_rank, + server.cluster->failover_auth_time - now, server.cluster->failover_auth_rank, replicationGetReplicaOffset()); /* Now that we have a scheduled election, broadcast our offset * to all the other replicas so that they'll updated their offsets * if our offset is better. 
*/ clusterBroadcastPong(CLUSTER_BROADCAST_LOCAL_REPLICAS); - return; + + /* Return ASAP if we can't start the election now. In a manual failover, + * we can start the election immediately, so in this case we continue to + * the next state without waiting for the next beforeSleep. */ + if (now < server.cluster->failover_auth_time) return; } /* It is possible that we received more updated offsets from other @@ -4626,7 +4671,7 @@ void clusterHandleReplicaFailover(void) { } /* Return ASAP if we can't still start the election. */ - if (mstime() < server.cluster->failover_auth_time) { + if (now < server.cluster->failover_auth_time) { clusterLogCantFailover(CLUSTER_CANT_FAILOVER_WAITING_DELAY); return; } @@ -4806,6 +4851,27 @@ void clusterHandleReplicaMigration(int max_replicas) { * data loss due to the asynchronous primary-replica replication. * -------------------------------------------------------------------------- */ +void manualFailoverCanStart(void) { + serverAssert(server.cluster->mf_can_start == 0); + + if (server.cluster->failover_auth_time) { + /* There is another manual failover requested by the user. + * If we have an ongoing election, reset it because the user may initiate + * manual failover again when the previous manual failover timed out. + * Otherwise, if the previous election timed out (see auth_timeout) and + * before the next retry (see auth_retry_time), the new manual failover + * will pause the primary and replica can not do anything to advance the + * manual failover, and then the manual failover eventually times out. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a new manual failover. " + "Resetting the election.", + (unsigned long long)server.cluster->failover_auth_epoch); + } + + server.cluster->mf_can_start = 1; +} + /* Reset the manual failover state. This works for both primaries and replicas * as all the state about manual failover is cleared. 
* @@ -4846,7 +4912,7 @@ void clusterHandleManualFailover(void) { if (server.cluster->mf_primary_offset == replicationGetReplicaOffset()) { /* Our replication offset matches the primary replication offset * announced after clients were paused. We can start the failover. */ - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); serverLog(LL_NOTICE, "All primary replication stream processed, " "manual failover can start."); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); @@ -4872,6 +4938,8 @@ static int clusterNodeCronHandleReconnect(clusterNode *node, mstime_t handshake_ /* A Node in HANDSHAKE state has a limited lifespan equal to the * configured node timeout. */ if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) { + serverLog(LL_WARNING, "Clusterbus handshake timeout %s:%d after %lldms", node->ip, + node->cport, handshake_timeout); clusterDelNode(node); return 1; } @@ -4964,7 +5032,7 @@ void clusterCron(void) { /* Ping some random node 1 time every 10 iterations, so that we usually ping * one random node every second. */ - if (!(iteration % 10)) { + if (!server.debug_cluster_disable_random_ping && !(iteration % 10)) { int j; /* Check a few random nodes and ping the one with the oldest @@ -5141,6 +5209,13 @@ void clusterBeforeSleep(void) { int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG; clusterSaveConfigOrDie(fsync); } + + if (flags & CLUSTER_TODO_BROADCAST_ALL) { + /* Broadcast a pong to all known nodes. This is useful when something changes + * in the configuration and we want to make the cluster aware of it before the + * regular ping.
*/ + clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + } } void clusterDoBeforeSleep(int flags) { @@ -5289,6 +5364,23 @@ void clusterCloseAllSlots(void) { * Cluster state evaluation function * -------------------------------------------------------------------------- */ +void clusterLogFailReason(int reason) { + if (reason == CLUSTER_FAIL_NONE) return; + + char *msg; + switch (reason) { + case CLUSTER_FAIL_NOT_FULL_COVERAGE: + msg = "At least one hash slot is not served by any available node. " + "Please check the 'cluster-require-full-coverage' configuration."; + break; + case CLUSTER_FAIL_MINORITY_PARTITION: + msg = "I am part of a minority partition."; + break; + default: serverPanic("Unknown fail reason code."); + } + serverLog(LL_WARNING, "Cluster is currently down: %s", msg); +} + /* The following are defines that are only used in the evaluation function * and are based on heuristics. Actually the main point about the rejoin and * writable delay is that they should be a few orders of magnitude larger @@ -5298,7 +5390,7 @@ void clusterCloseAllSlots(void) { #define CLUSTER_WRITABLE_DELAY 2000 void clusterUpdateState(void) { - int j, new_state; + int j, new_state, new_reason; int reachable_primaries = 0; static mstime_t among_minority_time; static mstime_t first_call_time = 0; @@ -5319,12 +5411,14 @@ void clusterUpdateState(void) { /* Start assuming the state is OK. We'll turn it into FAIL if there * are the right conditions. */ new_state = CLUSTER_OK; + new_reason = CLUSTER_FAIL_NONE; /* Check if all the slots are covered. 
*/ if (server.cluster_require_full_coverage) { for (j = 0; j < CLUSTER_SLOTS; j++) { if (server.cluster->slots[j] == NULL || server.cluster->slots[j]->flags & (CLUSTER_NODE_FAIL)) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_NOT_FULL_COVERAGE; break; } } @@ -5359,6 +5453,7 @@ void clusterUpdateState(void) { if (reachable_primaries < needed_quorum) { new_state = CLUSTER_FAIL; + new_reason = CLUSTER_FAIL_MINORITY_PARTITION; among_minority_time = mstime(); } } @@ -5382,7 +5477,21 @@ void clusterUpdateState(void) { serverLog(new_state == CLUSTER_OK ? LL_NOTICE : LL_WARNING, "Cluster state changed: %s", new_state == CLUSTER_OK ? "ok" : "fail"); server.cluster->state = new_state; + + /* Cluster state changes from ok to fail, print a log. */ + if (new_state == CLUSTER_FAIL) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; + } + } + + /* Cluster state is still fail, but the reason has changed, print a log. */ + if (new_state == CLUSTER_FAIL && new_reason != server.cluster->fail_reason) { + clusterLogFailReason(new_reason); + server.cluster->fail_reason = new_reason; } + + if (new_state == CLUSTER_OK) server.cluster->fail_reason = CLUSTER_FAIL_NONE; } /* This function is called after the node startup in order to verify that data @@ -5544,12 +5653,12 @@ sds representClusterNodeFlags(sds ci, uint16_t flags) { * else each slot is added separately. 
*/ sds representSlotInfo(sds ci, uint16_t *slot_info_pairs, int slot_info_pairs_count) { for (int i = 0; i < slot_info_pairs_count; i += 2) { - unsigned long start = slot_info_pairs[i]; - unsigned long end = slot_info_pairs[i + 1]; + unsigned int start = slot_info_pairs[i]; + unsigned int end = slot_info_pairs[i + 1]; if (start == end) { - ci = sdscatfmt(ci, " %i", start); + ci = sdscatfmt(ci, " %u", start); } else { - ci = sdscatfmt(ci, " %i-%i", start, end); + ci = sdscatfmt(ci, " %u-%u", start, end); } } return ci; @@ -6043,8 +6152,11 @@ void removeChannelsInSlot(unsigned int slot) { /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ unsigned int delKeysInSlot(unsigned int hashslot) { - if (!kvstoreDictSize(server.db->keys, hashslot)) return 0; + if (!countKeysInSlot(hashslot)) return 0; + /* We may lose a slot during the pause. We need to track this + * state so that we don't assert in propagateNow(). */ + server.server_del_keys_in_slot = 1; unsigned int j = 0; kvstoreDictIterator *kvs_di = NULL; @@ -6069,6 +6181,8 @@ unsigned int delKeysInSlot(unsigned int hashslot) { } kvstoreReleaseDictIterator(kvs_di); + server.server_del_keys_in_slot = 0; + serverAssert(server.execution_nesting == 0); return j; } @@ -6486,7 +6600,7 @@ void clusterCommandSetSlot(client *c) { } /* After importing this slot, let the other nodes know as * soon as possible. 
*/ - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); + clusterDoBeforeSleep(CLUSTER_TODO_BROADCAST_ALL); } } } @@ -6518,6 +6632,10 @@ int clusterCommandSpecial(client *c) { addReplyErrorFormat(c, "Invalid node address specified: %s:%s", (char *)c->argv[2]->ptr, (char *)c->argv[3]->ptr); } else { + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster meet %s:%lld (user request from '%s').", (char *)c->argv[2]->ptr, port, + client); + sdsfree(client); addReply(c, shared.ok); } } else if (!strcasecmp(c->argv[1]->ptr, "flushslots") && c->argc == 2) { @@ -6632,6 +6750,9 @@ int clusterCommandSpecial(client *c) { addReplyError(c, "Can't forget my master!"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster forget %s (user request from '%s').", (char *)c->argv[2]->ptr, client); + sdsfree(client); clusterBlacklistAddNode(n); clusterDelNode(n); clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); @@ -6678,8 +6799,7 @@ int clusterCommandSpecial(client *c) { * If the instance is a replica, it had a totally different replication history. * In these both cases, myself as a replica has to do a full sync. 
*/ clusterSetPrimary(n, 1, 1); - clusterBroadcastPong(CLUSTER_BROADCAST_ALL); - clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG); + clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_BROADCAST_ALL); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "count-failure-reports") && c->argc == 3) { /* CLUSTER COUNT-FAILURE-REPORTS */ @@ -6721,7 +6841,7 @@ int clusterCommandSpecial(client *c) { } resetManualFailover(); server.cluster->mf_end = mstime() + CLUSTER_MF_TIMEOUT; - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); if (takeover) { /* A takeover does not perform any initial check. It just @@ -6736,7 +6856,7 @@ int clusterCommandSpecial(client *c) { * primary to agree about the offset. We just failover taking over * it without coordination. */ serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); /* We can start a manual failover as soon as possible, setting a flag * here so that we don't need to waiting for the cron to kick in. 
*/ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); @@ -6800,6 +6920,9 @@ int clusterCommandSpecial(client *c) { "master nodes containing keys"); return 1; } + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); + serverLog(LL_NOTICE, "Cluster reset (user request from '%s').", client); + sdsfree(client); clusterReset(hard); addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "links") && c->argc == 2) { diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h index 5280644e6e..5595402a4d 100644 --- a/src/cluster_legacy.h +++ b/src/cluster_legacy.h @@ -25,6 +25,7 @@ #define CLUSTER_TODO_SAVE_CONFIG (1 << 2) #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3) #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4) +#define CLUSTER_TODO_BROADCAST_ALL (1 << 5) /* clusterLink encapsulates everything needed to talk with a remote node. */ typedef struct clusterLink { @@ -338,7 +339,8 @@ struct _clusterNode { mstime_t pong_received; /* Unix time we received the pong */ mstime_t data_received; /* Unix time we received any data */ mstime_t fail_time; /* Unix time when FAIL flag was set */ - mstime_t voted_time; /* Last time we voted for a replica of this primary */ + mstime_t voted_time; /* Last time we voted for a replica of this primary in non manual + * failover scenarios. */ mstime_t repl_offset_time; /* Unix time we received offset for this node */ mstime_t orphaned_time; /* Starting time of orphaned primary condition */ long long repl_offset; /* Last known repl offset for this node. */ @@ -368,6 +370,7 @@ struct clusterState { clusterNode *myself; /* This node */ uint64_t currentEpoch; int state; /* CLUSTER_OK, CLUSTER_FAIL, ... */ + int fail_reason; /* Why the cluster state changes to fail. 
*/ int size; /* Num of primary nodes with at least one slot */ dict *nodes; /* Hash table of name -> clusterNode structures */ dict *shards; /* Hash table of shard_id -> list (of nodes) structures */ diff --git a/src/commands.def b/src/commands.def index 791b30d540..1ac2368ee1 100644 --- a/src/commands.def +++ b/src/commands.def @@ -1230,6 +1230,34 @@ struct COMMAND_ARG CLIENT_CAPA_Args[] = { #define CLIENT_ID_Keyspecs NULL #endif +/********** CLIENT IMPORT_SOURCE ********************/ + +#ifndef SKIP_CMD_HISTORY_TABLE +/* CLIENT IMPORT_SOURCE history */ +#define CLIENT_IMPORT_SOURCE_History NULL +#endif + +#ifndef SKIP_CMD_TIPS_TABLE +/* CLIENT IMPORT_SOURCE tips */ +#define CLIENT_IMPORT_SOURCE_Tips NULL +#endif + +#ifndef SKIP_CMD_KEY_SPECS_TABLE +/* CLIENT IMPORT_SOURCE key specs */ +#define CLIENT_IMPORT_SOURCE_Keyspecs NULL +#endif + +/* CLIENT IMPORT_SOURCE enabled argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_enabled_Subargs[] = { +{MAKE_ARG("on",ARG_TYPE_PURE_TOKEN,-1,"ON",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("off",ARG_TYPE_PURE_TOKEN,-1,"OFF",NULL,NULL,CMD_ARG_NONE,0,NULL)}, +}; + +/* CLIENT IMPORT_SOURCE argument table */ +struct COMMAND_ARG CLIENT_IMPORT_SOURCE_Args[] = { +{MAKE_ARG("enabled",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_NONE,2,NULL),.subargs=CLIENT_IMPORT_SOURCE_enabled_Subargs}, +}; + /********** CLIENT INFO ********************/ #ifndef SKIP_CMD_HISTORY_TABLE @@ -1630,6 +1658,7 @@ struct COMMAND_STRUCT CLIENT_Subcommands[] = { {MAKE_CMD("getredir","Returns the client ID to which the connection's tracking notifications are redirected.","O(1)","6.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_GETREDIR_History,0,CLIENT_GETREDIR_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_GETREDIR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different 
subcommands.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_HELP_History,0,CLIENT_HELP_Tips,0,clientCommand,2,CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_HELP_Keyspecs,0,NULL,0)}, {MAKE_CMD("id","Returns the unique client ID of the connection.","O(1)","5.0.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_ID_History,0,CLIENT_ID_Tips,0,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_ID_Keyspecs,0,NULL,0)}, +{MAKE_CMD("import-source","Mark this client as an import source when server is in import mode.","O(1)","8.1.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_IMPORT_SOURCE_History,0,CLIENT_IMPORT_SOURCE_Tips,0,clientCommand,3,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE,ACL_CATEGORY_CONNECTION,CLIENT_IMPORT_SOURCE_Keyspecs,0,NULL,1),.args=CLIENT_IMPORT_SOURCE_Args}, {MAKE_CMD("info","Returns information about the connection.","O(1)","6.2.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_INFO_History,0,CLIENT_INFO_Tips,1,clientCommand,2,CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_INFO_Keyspecs,0,NULL,0)}, {MAKE_CMD("kill","Terminates open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_KILL_History,7,CLIENT_KILL_Tips,0,clientCommand,-3,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_KILL_Keyspecs,0,NULL,1),.args=CLIENT_KILL_Args}, {MAKE_CMD("list","Lists open connections.","O(N) where N is the number of client connections","2.4.0",CMD_DOC_NONE,NULL,NULL,"connection",COMMAND_GROUP_CONNECTION,CLIENT_LIST_History,7,CLIENT_LIST_Tips,1,clientCommand,-2,CMD_ADMIN|CMD_NOSCRIPT|CMD_LOADING|CMD_STALE|CMD_SENTINEL,ACL_CATEGORY_CONNECTION,CLIENT_LIST_Keyspecs,0,NULL,2),.args=CLIENT_LIST_Args}, @@ -7291,8 +7320,8 @@ struct COMMAND_ARG MEMORY_USAGE_Args[] 
= { struct COMMAND_STRUCT MEMORY_Subcommands[] = { {MAKE_CMD("doctor","Outputs a memory problems report.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_DOCTOR_History,0,MEMORY_DOCTOR_Tips,3,memoryCommand,2,0,0,MEMORY_DOCTOR_Keyspecs,0,NULL,0)}, {MAKE_CMD("help","Returns helpful text about the different subcommands.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_HELP_History,0,MEMORY_HELP_Tips,0,memoryCommand,2,CMD_LOADING|CMD_STALE,0,MEMORY_HELP_Keyspecs,0,NULL,0)}, -{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, -{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,0,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, +{MAKE_CMD("malloc-stats","Returns the allocator statistics.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_MALLOC_STATS_History,0,MEMORY_MALLOC_STATS_Tips,3,memoryCommand,2,CMD_LOADING,0,MEMORY_MALLOC_STATS_Keyspecs,0,NULL,0)}, +{MAKE_CMD("purge","Asks the allocator to release memory.","Depends on how much memory is allocated, could be slow","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_PURGE_History,0,MEMORY_PURGE_Tips,2,memoryCommand,2,CMD_LOADING,0,MEMORY_PURGE_Keyspecs,0,NULL,0)}, {MAKE_CMD("stats","Returns details about memory usage.","O(1)","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_STATS_History,0,MEMORY_STATS_Tips,3,memoryCommand,2,0,0,MEMORY_STATS_Keyspecs,0,NULL,0)}, {MAKE_CMD("usage","Estimates the memory usage of a key.","O(N) where N is the number 
of samples.","4.0.0",CMD_DOC_NONE,NULL,NULL,"server",COMMAND_GROUP_SERVER,MEMORY_USAGE_History,0,MEMORY_USAGE_Tips,0,memoryCommand,-3,CMD_READONLY,0,MEMORY_USAGE_Keyspecs,1,NULL,2),.args=MEMORY_USAGE_Args}, {0} diff --git a/src/commands/client-import-source.json b/src/commands/client-import-source.json new file mode 100644 index 0000000000..113c07d70a --- /dev/null +++ b/src/commands/client-import-source.json @@ -0,0 +1,40 @@ +{ + "IMPORT-SOURCE": { + "summary": "Mark this client as an import source when server is in import mode.", + "complexity": "O(1)", + "group": "connection", + "since": "8.1.0", + "arity": 3, + "container": "CLIENT", + "function": "clientCommand", + "command_flags": [ + "NOSCRIPT", + "LOADING", + "STALE" + ], + "acl_categories": [ + "CONNECTION" + ], + "reply_schema": { + "const": "OK" + }, + "arguments": [ + { + "name": "enabled", + "type": "oneof", + "arguments": [ + { + "name": "on", + "type": "pure-token", + "token": "ON" + }, + { + "name": "off", + "type": "pure-token", + "token": "OFF" + } + ] + } + ] + } +} \ No newline at end of file diff --git a/src/commands/memory-malloc-stats.json b/src/commands/memory-malloc-stats.json index 5ef6a31c40..af5d439744 100644 --- a/src/commands/memory-malloc-stats.json +++ b/src/commands/memory-malloc-stats.json @@ -12,6 +12,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:SPECIAL" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "type": "string", "description": "The memory allocator's internal statistics report." 
diff --git a/src/commands/memory-purge.json b/src/commands/memory-purge.json index 77ed61dc5b..aea3e2d24a 100644 --- a/src/commands/memory-purge.json +++ b/src/commands/memory-purge.json @@ -11,6 +11,9 @@ "REQUEST_POLICY:ALL_SHARDS", "RESPONSE_POLICY:ALL_SUCCEEDED" ], + "command_flags": [ + "LOADING" + ], "reply_schema": { "const": "OK" } diff --git a/src/config.c b/src/config.c index f718543c39..5a07c2c0f0 100644 --- a/src/config.c +++ b/src/config.c @@ -1013,15 +1013,14 @@ void configGetCommand(client *c) { #define CONFIG_REWRITE_SIGNATURE "# Generated by CONFIG REWRITE" -/* We use the following dictionary type to store where a configuration - * option is mentioned in the old configuration file, so it's - * like "maxmemory" -> list of line numbers (first line is zero). */ -void dictListDestructor(dict *d, void *val); - /* Sentinel config rewriting is implemented inside sentinel.c by * rewriteConfigSentinelOption(). */ void rewriteConfigSentinelOption(struct rewriteConfigState *state); +/* We use the following dictionary type to store where a configuration + * option is mentioned in the old configuration file, so it's + * like "maxmemory" -> list of line numbers (first line is zero). + */ dictType optionToLineDictType = { dictSdsCaseHash, /* hash function */ NULL, /* key dup */ @@ -1537,10 +1536,27 @@ void rewriteConfigOOMScoreAdjValuesOption(standardConfig *config, const char *na } /* Rewrite the bind option. */ -void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { +static void rewriteConfigBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state, char **bindaddr, int bindaddr_count) { UNUSED(config); int force = 1; sds line, addresses; + + /* Rewrite as bind ... 
*/ + if (bindaddr_count > 0) + addresses = sdsjoin(bindaddr, bindaddr_count, " "); + else + addresses = sdsnew("\"\""); + line = sdsnew(name); + line = sdscatlen(line, " ", 1); + line = sdscatsds(line, addresses); + sdsfree(addresses); + + rewriteConfigRewriteLine(state, name, line, force); +} + +/* Rewrite the socket bind option (server.bindaddr). */ +static void rewriteConfigSocketBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); int is_default = 0; /* Compare server.bindaddr with CONFIG_DEFAULT_BINDADDR */ @@ -1560,17 +1576,7 @@ void rewriteConfigBindOption(standardConfig *config, const char *name, struct re return; } - /* Rewrite as bind ... */ - if (server.bindaddr_count > 0) - addresses = sdsjoin(server.bindaddr, server.bindaddr_count, " "); - else - addresses = sdsnew("\"\""); - line = sdsnew(name); - line = sdscatlen(line, " ", 1); - line = sdscatsds(line, addresses); - sdsfree(addresses); - - rewriteConfigRewriteLine(state, name, line, force); + rewriteConfigBindOption(config, name, state, server.bindaddr, server.bindaddr_count); } /* Rewrite the loadmodule option.
*/ @@ -2638,7 +2644,7 @@ static int applyBind(const char **err) { tcp_listener->ct = connectionByType(CONN_TYPE_SOCKET); if (changeListener(tcp_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - if (tls_listener) closeListener(tls_listener); /* failed with TLS together */ + if (tls_listener) connCloseListener(tls_listener); /* failed with TLS together */ return 0; } @@ -2650,7 +2656,7 @@ static int applyBind(const char **err) { tls_listener->ct = connectionByType(CONN_TYPE_TLS); if (changeListener(tls_listener) == C_ERR) { *err = "Failed to bind to specified addresses."; - closeListener(tcp_listener); /* failed with TCP together */ + connCloseListener(tcp_listener); /* failed with TCP together */ return 0; } } @@ -2923,8 +2929,9 @@ static sds getConfigNotifyKeyspaceEventsOption(standardConfig *config) { return keyspaceEventsFlagsToString(server.notify_keyspace_events); } -static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err) { +static int setConfigBindOption(standardConfig *config, sds *argv, int argc, const char **err, char **bindaddr, int *bindaddr_count) { UNUSED(config); + int orig_bindaddr_count = *bindaddr_count; int j; if (argc > CONFIG_BINDADDR_MAX) { @@ -2936,11 +2943,73 @@ static int setConfigBindOption(standardConfig *config, sds *argv, int argc, cons if (argc == 1 && sdslen(argv[0]) == 0) argc = 0; /* Free old bind addresses */ - for (j = 0; j < server.bindaddr_count; j++) { - zfree(server.bindaddr[j]); + for (j = 0; j < orig_bindaddr_count; j++) zfree(bindaddr[j]); + for (j = 0; j < argc; j++) bindaddr[j] = zstrdup(argv[j]); + *bindaddr_count = argc; + + return 1; +} + +static int setConfigSocketBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.bindaddr, &server.bindaddr_count); +} + +static int setConfigRdmaBindOption(standardConfig *config, sds *argv, int argc, const char **err) { + 
UNUSED(config); + return setConfigBindOption(config, argv, argc, err, server.rdma_ctx_config.bindaddr, &server.rdma_ctx_config.bindaddr_count); +} + +static sds getConfigRdmaBindOption(standardConfig *config) { + UNUSED(config); + return sdsjoin(server.rdma_ctx_config.bindaddr, server.rdma_ctx_config.bindaddr_count, " "); +} + +static void rewriteConfigRdmaBindOption(standardConfig *config, const char *name, struct rewriteConfigState *state) { + UNUSED(config); + + if (server.rdma_ctx_config.bindaddr_count) { + rewriteConfigBindOption(config, name, state, server.rdma_ctx_config.bindaddr, + server.rdma_ctx_config.bindaddr_count); + } +} + +static int applyRdmaBind(const char **err) { + connListener *rdma_listener = listenerByType(CONN_TYPE_RDMA); + + if (!rdma_listener) { + *err = "No RDMA building support."; + return 0; + } + + rdma_listener->bindaddr = server.rdma_ctx_config.bindaddr; + rdma_listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + rdma_listener->port = server.rdma_ctx_config.port; + rdma_listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(rdma_listener) == C_ERR) { + *err = "Failed to bind to specified addresses for RDMA."; + return 0; + } + + return 1; +} + +static int updateRdmaPort(const char **err) { + connListener *listener = listenerByType(CONN_TYPE_RDMA); + + if (listener == NULL) { + *err = "No RDMA building support."; + return 0; + } + + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + if (changeListener(listener) == C_ERR) { + *err = "Unable to listen on this port for RDMA. 
Check server logs."; + return 0; } - for (j = 0; j < argc; j++) server.bindaddr[j] = zstrdup(argv[j]); - server.bindaddr_count = argc; return 1; } @@ -3140,6 +3209,7 @@ standardConfig static_configs[] = { createBoolConfig("enable-debug-assert", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, server.enable_debug_assert, 0, NULL, NULL), createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL), createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL), + createBoolConfig("import-mode", NULL, MODIFIABLE_CONFIG, server.import_mode, 0, NULL, NULL), /* String Configs */ createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL), @@ -3208,10 +3278,11 @@ standardConfig static_configs[] = { createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL), createIntConfig("tcp-keepalive", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.tcpkeepalive, 300, INTEGER_CONFIG, NULL, NULL), createIntConfig("cluster-migration-barrier", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_migration_barrier, 1, INTEGER_CONFIG, NULL, NULL), - createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ - createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cycle_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ + createIntConfig("active-defrag-cycle-min", NULL, MODIFIABLE_CONFIG, 1, 99, server.active_defrag_cpu_min, 1, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 1% CPU min (at lower threshold) */ + createIntConfig("active-defrag-cycle-max", NULL, MODIFIABLE_CONFIG, 1, 
99, server.active_defrag_cpu_max, 25, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: 25% CPU max (at upper threshold) */ createIntConfig("active-defrag-threshold-lower", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_lower, 10, INTEGER_CONFIG, NULL, NULL), /* Default: don't defrag when fragmentation is below 10% */ createIntConfig("active-defrag-threshold-upper", NULL, MODIFIABLE_CONFIG, 0, 1000, server.active_defrag_threshold_upper, 100, INTEGER_CONFIG, NULL, updateDefragConfiguration), /* Default: maximum defrag force at 100% fragmentation */ + createIntConfig("active-defrag-cycle-us", NULL, MODIFIABLE_CONFIG, 0, 100000, server.active_defrag_cycle_us, 500, INTEGER_CONFIG, NULL, updateDefragConfiguration), createIntConfig("lfu-log-factor", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_log_factor, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("lfu-decay-time", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.lfu_decay_time, 1, INTEGER_CONFIG, NULL, NULL), createIntConfig("replica-priority", "slave-priority", MODIFIABLE_CONFIG, 0, INT_MAX, server.replica_priority, 100, INTEGER_CONFIG, NULL, NULL), @@ -3237,6 +3308,9 @@ standardConfig static_configs[] = { createIntConfig("watchdog-period", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, INT_MAX, server.watchdog_period, 0, INTEGER_CONFIG, NULL, updateWatchdogPeriod), createIntConfig("shutdown-timeout", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.shutdown_timeout, 10, INTEGER_CONFIG, NULL, NULL), createIntConfig("repl-diskless-sync-max-replicas", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.repl_diskless_sync_max_replicas, 0, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.rdma_ctx_config.port, 0, INTEGER_CONFIG, NULL, updateRdmaPort), + createIntConfig("rdma-rx-size", NULL, IMMUTABLE_CONFIG, 64 * 1024, 16 * 1024 * 1024, server.rdma_ctx_config.rx_size, 1024 * 1024, INTEGER_CONFIG, NULL, NULL), + createIntConfig("rdma-completion-vector", 
NULL, IMMUTABLE_CONFIG, -1, 1024, server.rdma_ctx_config.completion_vector, -1, INTEGER_CONFIG, NULL, NULL), /* Unsigned int configs */ createUIntConfig("maxclients", NULL, MODIFIABLE_CONFIG, 1, UINT_MAX, server.maxclients, 10000, INTEGER_CONFIG, NULL, updateMaxclients), @@ -3316,7 +3390,8 @@ standardConfig static_configs[] = { createSpecialConfig("client-output-buffer-limit", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigClientOutputBufferLimitOption, getConfigClientOutputBufferLimitOption, rewriteConfigClientOutputBufferLimitOption, NULL), createSpecialConfig("oom-score-adj-values", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigOOMScoreAdjValuesOption, getConfigOOMScoreAdjValuesOption, rewriteConfigOOMScoreAdjValuesOption, updateOOMScoreAdj), createSpecialConfig("notify-keyspace-events", NULL, MODIFIABLE_CONFIG, setConfigNotifyKeyspaceEventsOption, getConfigNotifyKeyspaceEventsOption, rewriteConfigNotifyKeyspaceEventsOption, NULL), - createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigBindOption, getConfigBindOption, rewriteConfigBindOption, applyBind), + createSpecialConfig("bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigSocketBindOption, getConfigBindOption, rewriteConfigSocketBindOption, applyBind), + createSpecialConfig("rdma-bind", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigRdmaBindOption, getConfigRdmaBindOption, rewriteConfigRdmaBindOption, applyRdmaBind), createSpecialConfig("replicaof", "slaveof", IMMUTABLE_CONFIG | MULTI_ARG_CONFIG, setConfigReplicaOfOption, getConfigReplicaOfOption, rewriteConfigReplicaOfOption, NULL), createSpecialConfig("latency-tracking-info-percentiles", NULL, MODIFIABLE_CONFIG | MULTI_ARG_CONFIG, setConfigLatencyTrackingInfoPercentilesOutputOption, getConfigLatencyTrackingInfoPercentilesOutputOption, rewriteConfigLatencyTrackingInfoPercentilesOutputOption, NULL), diff --git a/src/config.h b/src/config.h index 3b79c5c681..a2e9f353dc 100644 --- a/src/config.h +++ 
b/src/config.h @@ -364,4 +364,17 @@ void setcpuaffinity(const char *cpulist); #define valkey_prefetch(addr) ((void)(addr)) #endif +/* Check if we can compile AVX2 code */ +#if defined(__x86_64__) && ((defined(__GNUC__) && __GNUC__ >= 5) || (defined(__clang__) && __clang_major__ >= 4)) +#if defined(__has_attribute) && __has_attribute(target) +#define HAVE_AVX2 +#endif +#endif + +#if defined(HAVE_AVX2) +#define ATTRIBUTE_TARGET_AVX2 __attribute__((target("avx2"))) +#else +#define ATTRIBUTE_TARGET_AVX2 +#endif + #endif diff --git a/src/connection.c b/src/connection.c index f0c1c2d364..8807541d77 100644 --- a/src/connection.c +++ b/src/connection.c @@ -66,6 +66,9 @@ int connTypeInitialize(void) { /* may fail if without BUILD_TLS=yes */ RedisRegisterConnectionTypeTLS(); + /* may fail if without BUILD_RDMA=yes */ + RegisterConnectionTypeRdma(); + return C_OK; } diff --git a/src/connection.h b/src/connection.h index 0762441732..8a2775ee34 100644 --- a/src/connection.h +++ b/src/connection.h @@ -60,6 +60,7 @@ typedef enum { #define CONN_TYPE_SOCKET "tcp" #define CONN_TYPE_UNIX "unix" #define CONN_TYPE_TLS "tls" +#define CONN_TYPE_RDMA "rdma" #define CONN_TYPE_MAX 8 /* 8 is enough to be extendable */ typedef void (*ConnectionCallbackFunc)(struct connection *conn); @@ -79,6 +80,7 @@ typedef struct ConnectionType { int (*addr)(connection *conn, char *ip, size_t ip_len, int *port, int remote); int (*is_local)(connection *conn); int (*listen)(connListener *listener); + void (*closeListener)(connListener *listener); /* create/shutdown/close connection */ connection *(*conn_create)(void); @@ -442,6 +444,13 @@ static inline int connListen(connListener *listener) { return listener->ct->listen(listener); } +/* Close a listened listener */ +static inline void connCloseListener(connListener *listener) { + if (listener->count) { + listener->ct->closeListener(listener); + } +} + /* Get accept_handler of a connection type */ static inline aeFileProc *connAcceptHandler(ConnectionType *ct) 
{ if (ct) return ct->accept_handler; @@ -454,6 +463,7 @@ sds getListensInfoString(sds info); int RedisRegisterConnectionTypeSocket(void); int RedisRegisterConnectionTypeUnix(void); int RedisRegisterConnectionTypeTLS(void); +int RegisterConnectionTypeRdma(void); /* Return 1 if connection is using TLS protocol, 0 if otherwise. */ static inline int connIsTLS(connection *conn) { diff --git a/src/db.c b/src/db.c index ceb3105f9b..3c3ccb4899 100644 --- a/src/db.c +++ b/src/db.c @@ -59,6 +59,7 @@ int keyIsExpired(serverDb *db, robj *key); static void dbSetValue(serverDb *db, robj *key, robj *val, int overwrite, dictEntry *de); static int getKVStoreIndexForKey(sds key); dictEntry *dbFindExpiresWithDictIndex(serverDb *db, void *key, int dict_index); +dictEntry *dbFindWithDictIndex(serverDb *db, void *key, int dict_index); /* Update LFU when an object is accessed. * Firstly, decrement the counter if the decrement time is reached. @@ -97,7 +98,8 @@ void updateLFU(robj *val) { * expired on replicas even if the primary is lagging expiring our key via DELs * in the replication link. */ robj *lookupKey(serverDb *db, robj *key, int flags) { - dictEntry *de = dbFind(db, key->ptr); + int dict_index = getKVStoreIndexForKey(key->ptr); + dictEntry *de = dbFindWithDictIndex(db, key->ptr, dict_index); robj *val = NULL; if (de) { val = dictGetVal(de); @@ -113,7 +115,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { int expire_flags = 0; if (flags & LOOKUP_WRITE && !is_ro_replica) expire_flags |= EXPIRE_FORCE_DELETE_EXPIRED; if (flags & LOOKUP_NOEXPIRE) expire_flags |= EXPIRE_AVOID_DELETE_EXPIRED; - if (expireIfNeeded(db, key, expire_flags) != KEY_VALID) { + if (expireIfNeededWithDictIndex(db, key, expire_flags, dict_index) != KEY_VALID) { /* The key is no longer valid. 
*/ val = NULL; } @@ -129,7 +131,7 @@ robj *lookupKey(serverDb *db, robj *key, int flags) { if (!hasActiveChildProcess() && !(flags & LOOKUP_NOTOUCH)) { if (!canUseSharedObject() && val->refcount == OBJ_SHARED_REFCOUNT) { val = dupStringObject(val); - kvstoreDictSetVal(db->keys, getKVStoreIndexForKey(key->ptr), de, val); + kvstoreDictSetVal(db->keys, dict_index, de, val); } if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { updateLFU(val); @@ -385,10 +387,10 @@ robj *dbRandomKey(serverDb *db) { key = dictGetKey(de); keyobj = createStringObject(key, sdslen(key)); if (dbFindExpiresWithDictIndex(db, key, randomDictIndex)) { - if (allvolatile && server.primary_host && --maxtries == 0) { + if (allvolatile && (server.primary_host || server.import_mode) && --maxtries == 0) { /* If the DB is composed only of keys with an expire set, * it could happen that all the keys are already logically - * expired in the repilca, so the function cannot stop because + * expired in the replica, so the function cannot stop because * expireIfNeeded() is false, nor it can stop because * dictGetFairRandomKey() returns NULL (there are keys to return). * To prevent the infinite loop we do some tries, but if there @@ -572,7 +574,7 @@ long long emptyData(int dbnum, int flags, void(callback)(dict *)) { if (with_functions) { serverAssert(dbnum == -1); - functionsLibCtxClearCurrent(async); + functionsLibCtxClearCurrent(async, callback); } /* Also fire the end event. Note that this event will fire almost @@ -600,12 +602,10 @@ serverDb *initTempDb(void) { return tempDb; } -/* Discard tempDb, this can be slow (similar to FLUSHALL), but it's always async. */ -void discardTempDb(serverDb *tempDb, void(callback)(dict *)) { - int async = 1; - +/* Discard tempDb, it's always async. */ +void discardTempDb(serverDb *tempDb) { /* Release temp DBs. 
*/ - emptyDbStructure(tempDb, -1, async, callback); + emptyDbStructure(tempDb, -1, 1, NULL); for (int i = 0; i < server.dbnum; i++) { kvstoreRelease(tempDb[i].keys); kvstoreRelease(tempDb[i].expires); @@ -830,22 +830,27 @@ void keysCommand(client *c) { kvstoreDictIterator *kvs_di = NULL; kvstoreIterator *kvs_it = NULL; if (pslot != -1) { - if (!kvstoreDictSize(c->db->keys, pslot)) { - /* Requested slot is empty */ - setDeferredArrayLen(c, replylen, 0); - return; - } kvs_di = kvstoreGetDictSafeIterator(c->db->keys, pslot); } else { kvs_it = kvstoreIteratorInit(c->db->keys); } - robj keyobj; - while ((de = kvs_di ? kvstoreDictIteratorNext(kvs_di) : kvstoreIteratorNext(kvs_it)) != NULL) { + while (1) { + robj keyobj; + int dict_index; + if (kvs_di) { + de = kvstoreDictIteratorNext(kvs_di); + dict_index = pslot; + } else { + de = kvstoreIteratorNext(kvs_it); + dict_index = kvstoreIteratorGetCurrentDictIndex(kvs_it); + } + if (de == NULL) break; + sds key = dictGetKey(de); if (allkeys || stringmatchlen(pattern, plen, key, sdslen(key), 0)) { initStaticStringObject(keyobj, key); - if (!keyIsExpired(c->db, &keyobj)) { + if (!keyIsExpiredWithDictIndex(c->db, &keyobj, dict_index)) { addReplyBulkCBuffer(c, key, sdslen(key)); numkeys++; } @@ -1784,7 +1789,7 @@ void propagateDeletion(serverDb *db, robj *key, int lazy) { decrRefCount(argv[1]); } -int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { +static int keyIsExpiredWithDictIndexImpl(serverDb *db, robj *key, int dict_index) { /* Don't expire anything while loading. It will be done later. */ if (server.loading) return 0; @@ -1800,6 +1805,17 @@ int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { return now > when; } +/* Check if the key is expired. */ +int keyIsExpiredWithDictIndex(serverDb *db, robj *key, int dict_index) { + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return 0; + + /* See expireIfNeededWithDictIndex for more details. 
*/ + if (server.primary_host == NULL && server.import_mode) { + if (server.current_client && server.current_client->flag.import_source) return 0; + } + return 1; +} + /* Check if the key is expired. */ int keyIsExpired(serverDb *db, robj *key) { int dict_index = getKVStoreIndexForKey(key->ptr); @@ -1808,7 +1824,7 @@ int keyIsExpired(serverDb *db, robj *key) { keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int dict_index) { if (server.lazy_expire_disabled) return KEY_VALID; - if (!keyIsExpiredWithDictIndex(db, key, dict_index)) return KEY_VALID; + if (!keyIsExpiredWithDictIndexImpl(db, key, dict_index)) return KEY_VALID; /* If we are running in the context of a replica, instead of * evicting the expired key from the database, we return ASAP: @@ -1826,6 +1842,25 @@ keyStatus expireIfNeededWithDictIndex(serverDb *db, robj *key, int flags, int di if (server.primary_host != NULL) { if (server.current_client && (server.current_client->flag.primary)) return KEY_VALID; if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; + } else if (server.import_mode) { + /* If we are running in the import mode on a primary, instead of + * evicting the expired key from the database, we return ASAP: + * the key expiration is controlled by the import source that will + * send us synthesized DEL operations for expired keys. The + * exception is when write operations are performed on this server + * because it's a primary. + * + * Notice: other clients, apart from the import source, should not access + * the data imported by import source. + * + * Still we try to return the right information to the caller, + * that is, KEY_VALID if we think the key should still be valid, + * KEY_EXPIRED if we think the key is expired but don't want to delete it at this time. + * + * When receiving commands from the import source, keys are never considered + * expired. 
*/ + if (server.current_client && (server.current_client->flag.import_source)) return KEY_VALID; + if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return KEY_EXPIRED; } /* In some cases we're explicitly instructed to return an indication of a @@ -1889,7 +1924,7 @@ keyStatus expireIfNeeded(serverDb *db, robj *key, int flags) { * The purpose is to skip expansion of unused dicts in cluster mode (all * dicts not mapped to *my* slots) */ static int dbExpandSkipSlot(int slot) { - return !clusterNodeCoversSlot(getMyClusterNode(), slot); + return !clusterNodeCoversSlot(clusterNodeGetPrimary(getMyClusterNode()), slot); } /* diff --git a/src/debug.c b/src/debug.c index 98512fd436..7407af3514 100644 --- a/src/debug.c +++ b/src/debug.c @@ -46,6 +46,8 @@ #include #include +#include "valkey_strtod.h" + #ifdef HAVE_BACKTRACE #include #ifndef __OpenBSD__ @@ -432,10 +434,12 @@ void debugCommand(client *c) { " Some fields of the default behavior may be time consuming to fetch,", " and `fast` can be passed to avoid fetching them.", "DROP-CLUSTER-PACKET-FILTER ", - " Drop all packets that match the filtered type. Set to -1 allow all packets.", + " Drop all packets that match the filtered type. 
Set to -1 allow all packets or -2 to drop all packets.", "CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", " This is valid only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", " When set to 1, the cluster link is closed after dropping a packet based on the filter.", + "DISABLE-CLUSTER-RANDOM-PING <0|1>", + " Disable sending cluster ping to a random node every second.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -607,6 +611,9 @@ void debugCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "close-cluster-link-on-packet-drop") && c->argc == 3) { server.debug_cluster_close_link_on_packet_drop = atoi(c->argv[2]->ptr); addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "disable-cluster-random-ping") && c->argc == 3) { + server.debug_cluster_disable_random_ping = atoi(c->argv[2]->ptr); + addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "object") && (c->argc == 3 || c->argc == 4)) { dictEntry *de; robj *val; @@ -841,7 +848,7 @@ void debugCommand(client *c) { "string|integer|double|bignum|null|array|set|map|attrib|push|verbatim|true|false"); } } else if (!strcasecmp(c->argv[1]->ptr, "sleep") && c->argc == 3) { - double dtime = strtod(c->argv[2]->ptr, NULL); + double dtime = valkey_strtod(c->argv[2]->ptr, NULL); long long utime = dtime * 1000000; struct timespec tv; @@ -1023,7 +1030,7 @@ void debugCommand(client *c) { /* =========================== Crash handling ============================== */ -__attribute__((noinline)) void _serverAssert(const char *estr, const char *file, int line) { +__attribute__((noinline, weak)) void _serverAssert(const char *estr, const char *file, int line) { int new_report = bugReportStart(); serverLog(LL_WARNING, "=== %sASSERTION FAILED ===", new_report ? 
"" : "RECURSIVE "); serverLog(LL_WARNING, "==> %s:%d '%s' is not true", file, line, estr); @@ -1042,6 +1049,14 @@ __attribute__((noinline)) void _serverAssert(const char *estr, const char *file, bugReportEnd(0, 0); } +/* Returns the argv argument in binary representation, limited to length 128. */ +sds getArgvReprString(robj *argv) { + robj *decoded = getDecodedObject(argv); + sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128)); + decrRefCount(decoded); + return repr; +} + /* Checks if the argument at the given index should be redacted from logs. */ int shouldRedactArg(const client *c, int idx) { serverAssert(idx < c->argc); @@ -1066,16 +1081,12 @@ void _serverAssertPrintClientInfo(const client *c) { serverLog(LL_WARNING, "client->argv[%d]: %zu bytes", j, sdslen((sds)c->argv[j]->ptr)); continue; } - char buf[128]; - char *arg; - - if (c->argv[j]->type == OBJ_STRING && sdsEncodedObject(c->argv[j])) { - arg = (char *)c->argv[j]->ptr; - } else { - snprintf(buf, sizeof(buf), "Object type: %u, encoding: %u", c->argv[j]->type, c->argv[j]->encoding); - arg = buf; + sds repr = getArgvReprString(c->argv[j]); + serverLog(LL_WARNING, "client->argv[%d] = %s (refcount: %d)", j, repr, c->argv[j]->refcount); + sdsfree(repr); + if (!strcasecmp(c->argv[j]->ptr, "auth") || !strcasecmp(c->argv[j]->ptr, "auth2")) { + break; } - serverLog(LL_WARNING, "client->argv[%d] = \"%s\" (refcount: %d)", j, arg, c->argv[j]->refcount); } } @@ -1883,23 +1894,18 @@ void logCurrentClient(client *cc, const char *title) { client = catClientInfoString(sdsempty(), cc, server.hide_user_data_from_log); serverLog(LL_WARNING | LL_RAW, "%s\n", client); sdsfree(client); - serverLog(LL_WARNING | LL_RAW, "argc: '%d'\n", cc->argc); + serverLog(LL_WARNING | LL_RAW, "argc: %d\n", cc->argc); for (j = 0; j < cc->argc; j++) { if (shouldRedactArg(cc, j)) { serverLog(LL_WARNING | LL_RAW, "argv[%d]: %zu bytes\n", j, sdslen((sds)cc->argv[j]->ptr)); continue; } - robj *decoded; - decoded = 
getDecodedObject(cc->argv[j]); - sds repr = sdscatrepr(sdsempty(), decoded->ptr, min(sdslen(decoded->ptr), 128)); - serverLog(LL_WARNING | LL_RAW, "argv[%d]: '%s'\n", j, (char *)repr); - if (!strcasecmp(decoded->ptr, "auth") || !strcasecmp(decoded->ptr, "auth2")) { - sdsfree(repr); - decrRefCount(decoded); + sds repr = getArgvReprString(cc->argv[j]); + serverLog(LL_WARNING | LL_RAW, "argv[%d]: %s\n", j, repr); + sdsfree(repr); + if (!strcasecmp(cc->argv[j]->ptr, "auth") || !strcasecmp(cc->argv[j]->ptr, "auth2")) { break; } - sdsfree(repr); - decrRefCount(decoded); } /* Check if the first argument, usually a key, is found inside the * selected DB, and if so print info about the associated object. */ diff --git a/src/defrag.c b/src/defrag.c index 4d34009f8b..9c195e8959 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -38,30 +38,129 @@ #ifdef HAVE_DEFRAG -typedef struct defragCtx { - void *privdata; +typedef enum { DEFRAG_NOT_DONE = 0, + DEFRAG_DONE = 1 } doneStatus; + + +/* + * Defragmentation is performed in stages. Each stage is serviced by a stage function + * (defragStageFn). The stage function is passed a target (void*) to defrag. The contents of that + * target are unique to the particular stage - and may even be NULL for some stage functions. The + * same stage function can be used multiple times (for different stages) each having a different + * target. + * + * The stage function is required to maintain an internal static state. This allows the stage + * function to continue when invoked in an iterative manner. When invoked with a 0 endtime, the + * stage function is required to clear its internal state and prepare to begin a new stage. It + * should return DEFRAG_NOT_DONE (more work to do) as it should NOT perform any real "work" during init. + * + * Parameters: + * endtime - This is the monotonic time that the function should end and return. This ensures + * a bounded latency due to defrag. 
When endtime is 0, the internal state should be + * cleared, preparing to begin the stage with a new target. + * target - This is the "thing" that should be defragged. Its type is dependent on the + * type of the stage function. This might be a dict, a kvstore, a DB, or other. + * privdata - A pointer to arbitrary private data which is unique to the stage function. + * + * Returns: + * - DEFRAG_DONE if the stage is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*defragStageFn)(monotime endtime, void *target, void *privdata); + +typedef struct { + defragStageFn stage_fn; // The function to be invoked for the stage + void *target; // The target that the function will defrag + void *privdata; // Private data, unique to the stage function +} StageDescriptor; + +/* Globals needed for the main defrag processing logic. + * Doesn't include variables specific to a stage or type of data. */ +struct DefragContext { + monotime start_cycle; // Time of beginning of defrag cycle + long long start_defrag_hits; // server.stat_active_defrag_hits captured at beginning of cycle + list *remaining_stages; // List of stages which remain to be processed + StageDescriptor *current_stage; // The stage that's currently being processed + + long long timeproc_id; // Eventloop ID of the timerproc (or AE_DELETED_EVENT_ID) + monotime timeproc_end_time; // Ending time of previous timerproc execution + long timeproc_overage_us; // A correction value if over/under target CPU percent +}; +static struct DefragContext defrag; + + +/* There are a number of stages which process a kvstore. To simplify this, a stage helper function + * `defragStageKvstoreHelper()` is defined. This function aids in iterating over the kvstore. It + * uses these definitions. + */ +/* State of the kvstore helper. The private data (privdata) passed to the kvstore helper MUST BEGIN + * with a kvstoreIterState (or be passed as NULL). 
*/ +#define KVS_SLOT_DEFRAG_LUT -2 +#define KVS_SLOT_UNASSIGNED -1 +typedef struct { + kvstore *kvs; int slot; - void *aux; -} defragCtx; + unsigned long cursor; +} kvstoreIterState; +/* The kvstore helper uses this function to perform tasks before continuing the iteration. For the + * main dictionary, large items are set aside and processed by this function before continuing with + * iteration over the kvstore. + * endtime - This is the monotonic time that the function should end and return. + * privdata - Private data for functions invoked by the helper. If provided in the call to + * `defragStageKvstoreHelper()`, the `kvstoreIterState` portion (at the beginning) + * will be updated with the current kvstore iteration status. + * + * Returns: + * - DEFRAG_DONE if the pre-continue work is complete + * - DEFRAG_NOT_DONE if there is more work to do + */ +typedef doneStatus (*kvstoreHelperPreContinueFn)(monotime endtime, void *privdata); + + +// Private data for main dictionary keys +typedef struct { + kvstoreIterState kvstate; + serverDb *db; + dictEntry *saved_expire_de; +} defragKeysCtx; +static_assert(offsetof(defragKeysCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); + +// Private data for pubsub kvstores +typedef dict *(*getClientChannelsFn)(client *); +typedef struct { + getClientChannelsFn fn; +} getClientChannelsFnWrapper; -typedef struct defragPubSubCtx { - kvstore *pubsub_channels; - dict *(*clientPubSubChannels)(client *); +typedef struct { + kvstoreIterState kvstate; + getClientChannelsFn getPubSubChannels; } defragPubSubCtx; +static_assert(offsetof(defragPubSubCtx, kvstate) == 0, "defragStageKvstoreHelper requires this"); + + +/* When scanning a main kvstore, large elements are queued for later handling rather than + * causing a large latency spike while processing a hash table bucket. This list is only used + * for stage: "defragStageDbKeys". It will only contain values for the current kvstore being + * defragged. 
+ * Note that this is a list of key names. It's possible that the key may be deleted or modified + * before "later" and we will search by key name to find the entry when we defrag the item later. + */ +static list *defrag_later; +static unsigned long defrag_later_cursor; + /* this method was added to jemalloc in order to help us understand which * pointers are worthwhile moving and which aren't */ int je_get_defrag_hint(void *ptr); -/* Defrag helper for generic allocations. - * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released - * and should NOT be accessed. */ -void *activeDefragAlloc(void *ptr) { +/* Defrag function which allocates and copies memory if needed, but DOESN'T free the old block. + * It is the responsibility of the caller to free the old block if a non-NULL value (new block) + * is returned. (Returns NULL if no relocation was needed.) + */ +static void *activeDefragAllocWithoutFree(void *ptr, size_t *allocation_size) { size_t size; void *newptr; - if (!je_get_defrag_hint(ptr)) { + if (!allocatorShouldDefrag(ptr)) { server.stat_active_defrag_misses++; return NULL; } @@ -69,30 +168,45 @@ void *activeDefragAlloc(void *ptr) { * make sure not to use the thread cache. so that we don't get back the same * pointers we try to free */ size = zmalloc_size(ptr); - newptr = zmalloc_no_tcache(size); + newptr = allocatorDefragAlloc(size); memcpy(newptr, ptr, size); - zfree_no_tcache(ptr); + if (allocation_size) *allocation_size = size; + server.stat_active_defrag_hits++; return newptr; } +/* Defrag helper for generic allocations. + * + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released + * and should NOT be accessed. 
*/ +void *activeDefragAlloc(void *ptr) { + size_t allocation_size; + void *newptr = activeDefragAllocWithoutFree(ptr, &allocation_size); + if (newptr) allocatorDefragFree(ptr, allocation_size); + return newptr; +} + /* This method captures the expiry db dict entry which refers to data stored in keys db dict entry. */ -void defragEntryStartCbForKeys(void *ctx, void *oldptr) { - defragCtx *defragctx = (defragCtx *)ctx; - serverDb *db = defragctx->privdata; +static void defragEntryStartCbForKeys(void *ctx, void *oldptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + serverDb *db = defragctx->db; sds oldsds = (sds)dictGetKey((dictEntry *)oldptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; if (kvstoreDictSize(db->expires, slot)) { dictEntry *expire_de = kvstoreDictFind(db->expires, slot, oldsds); - defragctx->aux = expire_de; + defragctx->saved_expire_de = expire_de; + } else { + defragctx->saved_expire_de = NULL; } } /* This method updates the key of expiry db dict entry. The key might be no longer valid * as it could have been cleaned up during the defrag-realloc of the main dictionary. */ -void defragEntryFinishCbForKeys(void *ctx, void *newptr) { - defragCtx *defragctx = (defragCtx *)ctx; - dictEntry *expire_de = (dictEntry *)defragctx->aux; +static void defragEntryFinishCbForKeys(void *ctx, void *newptr) { + defragKeysCtx *defragctx = (defragKeysCtx *)ctx; + dictEntry *expire_de = defragctx->saved_expire_de; /* Item doesn't have TTL associated to it. */ if (!expire_de) return; /* No reallocation happened. 
*/ @@ -100,18 +214,18 @@ void defragEntryFinishCbForKeys(void *ctx, void *newptr) { expire_de = NULL; return; } - serverDb *db = defragctx->privdata; + serverDb *db = defragctx->db; sds newsds = (sds)dictGetKey((dictEntry *)newptr); - int slot = defragctx->slot; + int slot = defragctx->kvstate.slot; kvstoreDictSetKey(db->expires, slot, expire_de, newsds); } -/*Defrag helper for sds strings +/* Defrag helper for sds strings * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -sds activeDefragSds(sds sdsptr) { +static sds activeDefragSds(sds sdsptr) { void *ptr = sdsAllocPtr(sdsptr); void *newptr = activeDefragAlloc(ptr); if (newptr) { @@ -122,60 +236,48 @@ sds activeDefragSds(sds sdsptr) { return NULL; } -/* Defrag helper for robj and/or string objects with expected refcount. - * - * Like activeDefragStringOb, but it requires the caller to pass in the expected - * reference count. In some cases, the caller needs to update a robj whose - * reference count is not 1, in these cases, the caller must explicitly pass - * in the reference count, otherwise defragmentation will not be performed. - * Note that the caller is responsible for updating any other references to the robj. */ -robj *activeDefragStringObEx(robj *ob, int expected_refcount) { - robj *ret = NULL; - if (ob->refcount != expected_refcount) return NULL; - - /* try to defrag robj (only if not an EMBSTR type (handled below). */ - if (ob->type != OBJ_STRING || ob->encoding != OBJ_ENCODING_EMBSTR) { - if ((ret = activeDefragAlloc(ob))) { - ob = ret; - } +/* Performs defrag on a string-type (or generic) robj, but does not free the old robj. This is the + * caller's responsibility. This is necessary for string objects with multiple references. 
In this + * case the caller can fix the references before freeing the original object. + */ +static robj *activeDefragStringObWithoutFree(robj *ob, size_t *allocation_size) { + if (ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_RAW) { + // Try to defrag the linked sds, regardless of if robj will be moved + sds newsds = activeDefragSds((sds)ob->ptr); + if (newsds) ob->ptr = newsds; } - /* try to defrag string object */ - if (ob->type == OBJ_STRING) { - if (ob->encoding == OBJ_ENCODING_RAW) { - sds newsds = activeDefragSds((sds)ob->ptr); - if (newsds) { - ob->ptr = newsds; - } - } else if (ob->encoding == OBJ_ENCODING_EMBSTR) { - /* The sds is embedded in the object allocation, calculate the - * offset and update the pointer in the new allocation. */ - long ofs = (intptr_t)ob->ptr - (intptr_t)ob; - if ((ret = activeDefragAlloc(ob))) { - ret->ptr = (void *)((intptr_t)ret + ofs); - } - } else if (ob->encoding != OBJ_ENCODING_INT) { - serverPanic("Unknown string encoding"); - } + robj *new_robj = activeDefragAllocWithoutFree(ob, allocation_size); + + if (new_robj && ob->type == OBJ_STRING && ob->encoding == OBJ_ENCODING_EMBSTR) { + // If the robj is moved, correct the internal pointer + long embstr_offset = (intptr_t)ob->ptr - (intptr_t)ob; + new_robj->ptr = (void *)((intptr_t)new_robj + embstr_offset); } - return ret; + return new_robj; } + /* Defrag helper for robj and/or string objects * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. 
*/ robj *activeDefragStringOb(robj *ob) { - return activeDefragStringObEx(ob, 1); + size_t allocation_size; + if (ob->refcount != 1) return NULL; // Unsafe to defrag if multiple refs + robj *new_robj = activeDefragStringObWithoutFree(ob, &allocation_size); + if (new_robj) allocatorDefragFree(ob, allocation_size); + return new_robj; } + /* Defrag helper for lua scripts * - * returns NULL in case the allocation wasn't moved. - * when it returns a non-null value, the old pointer was already released + * Returns NULL in case the allocation wasn't moved. + * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -luaScript *activeDefragLuaScript(luaScript *script) { +static luaScript *activeDefragLuaScript(luaScript *script) { luaScript *ret = NULL; /* try to defrag script struct */ @@ -197,7 +299,7 @@ luaScript *activeDefragLuaScript(luaScript *script) { * Returns NULL in case the allocation wasn't moved. * When it returns a non-null value, the old pointer was already released * and should NOT be accessed. */ -dict *dictDefragTables(dict *d) { +static dict *dictDefragTables(dict *d) { dict *ret = NULL; dictEntry **newtable; /* handle the dict struct */ @@ -215,7 +317,7 @@ dict *dictDefragTables(dict *d) { } /* Internal function used by zslDefrag */ -void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { +static void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnode, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == oldnode) update[i]->level[i].forward = newnode; @@ -237,7 +339,7 @@ void zslUpdateNode(zskiplist *zsl, zskiplistNode *oldnode, zskiplistNode *newnod * only need to defrag the skiplist, but not update the obj pointer. * When return value is non-NULL, it is the score reference that must be updated * in the dict record. 
*/ -double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { +static double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { zskiplistNode *update[ZSKIPLIST_MAXLEVEL], *x, *newx; int i; sds ele = newele ? newele : oldele; @@ -271,7 +373,7 @@ double *zslDefrag(zskiplist *zsl, double score, sds oldele, sds newele) { /* Defrag helper for sorted set. * Defrag a single dict entry key name, and corresponding skiplist struct */ -void activeDefragZsetEntry(zset *zs, dictEntry *de) { +static void activeDefragZsetEntry(zset *zs, dictEntry *de) { sds newsds; double *newscore; sds sdsele = dictGetKey(de); @@ -288,13 +390,13 @@ void activeDefragZsetEntry(zset *zs, dictEntry *de) { #define DEFRAG_SDS_DICT_VAL_VOID_PTR 3 #define DEFRAG_SDS_DICT_VAL_LUA_SCRIPT 4 -void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { +static void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); } /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ -void activeDefragSdsDict(dict *d, int val_type) { +static void activeDefragSdsDict(dict *d, int val_type) { unsigned long cursor = 0; dictDefragFunctions defragfns = { .defragAlloc = activeDefragAlloc, @@ -310,34 +412,7 @@ void activeDefragSdsDict(dict *d, int val_type) { } /* Defrag a list of ptr, sds or robj string values */ -void activeDefragList(list *l, int val_type) { - listNode *ln, *newln; - for (ln = l->head; ln; ln = ln->next) { - if ((newln = activeDefragAlloc(ln))) { - if (newln->prev) - newln->prev->next = newln; - else - l->head = newln; - if (newln->next) - newln->next->prev = newln; - else - l->tail = newln; - ln = newln; - } - if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) { - sds newsds, sdsele = ln->value; - if ((newsds = activeDefragSds(sdsele))) ln->value = newsds; - } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) { - robj *newele, *ele = ln->value; - if ((newele = activeDefragStringOb(ele))) 
ln->value = newele; - } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) { - void *newptr, *ptr = ln->value; - if ((newptr = activeDefragAlloc(ptr))) ln->value = newptr; - } - } -} - -void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { +static void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { quicklistNode *newnode, *node = *node_ref; unsigned char *newzl; if ((newnode = activeDefragAlloc(node))) { @@ -354,7 +429,7 @@ void activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) { if ((newzl = activeDefragAlloc(node->entry))) node->entry = newzl; } -void activeDefragQuickListNodes(quicklist *ql) { +static void activeDefragQuickListNodes(quicklist *ql) { quicklistNode *node = ql->head; while (node) { activeDefragQuickListNode(ql, &node); @@ -365,13 +440,18 @@ void activeDefragQuickListNodes(quicklist *ql) { /* when the value has lots of elements, we want to handle it later and not as * part of the main dictionary scan. this is needed in order to prevent latency * spikes when handling large items */ -void defragLater(serverDb *db, dictEntry *kde) { +static void defragLater(dictEntry *kde) { + if (!defrag_later) { + defrag_later = listCreate(); + listSetFreeMethod(defrag_later, (void (*)(void *))sdsfree); + defrag_later_cursor = 0; + } sds key = sdsdup(dictGetKey(kde)); - listAddNodeTail(db->defrag_later, key); + listAddNodeTail(defrag_later, key); } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { +static long scanLaterList(robj *ob, unsigned long *cursor, monotime endtime) { quicklist *ql = ob->ptr; quicklistNode *node; long iterations = 0; @@ -396,7 +476,7 @@ long scanLaterList(robj *ob, unsigned long *cursor, long long endtime) { activeDefragQuickListNode(ql, &node); server.stat_active_defrag_scanned++; if (++iterations > 128 && !bookmark_failed) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { if (!quicklistBookmarkCreate(&ql, "_AD", node)) { bookmark_failed = 1; } else { @@ -417,14 +497,14 @@ typedef struct { zset *zs; } scanLaterZsetData; -void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { +static void scanLaterZsetCallback(void *privdata, const dictEntry *_de) { dictEntry *de = (dictEntry *)_de; scanLaterZsetData *data = privdata; activeDefragZsetEntry(data->zs, de); server.stat_active_defrag_scanned++; } -void scanLaterZset(robj *ob, unsigned long *cursor) { +static void scanLaterZset(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_ZSET || ob->encoding != OBJ_ENCODING_SKIPLIST) return; zset *zs = (zset *)ob->ptr; dict *d = zs->dict; @@ -434,13 +514,13 @@ void scanLaterZset(robj *ob, unsigned long *cursor) { } /* Used as scan callback when all the work is done in the dictDefragFunctions. 
*/ -void scanCallbackCountScanned(void *privdata, const dictEntry *de) { +static void scanCallbackCountScanned(void *privdata, const dictEntry *de) { UNUSED(privdata); UNUSED(de); server.stat_active_defrag_scanned++; } -void scanLaterSet(robj *ob, unsigned long *cursor) { +static void scanLaterSet(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -448,7 +528,7 @@ void scanLaterSet(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void scanLaterHash(robj *ob, unsigned long *cursor) { +static void scanLaterHash(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, @@ -457,18 +537,18 @@ void scanLaterHash(robj *ob, unsigned long *cursor) { *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } -void defragQuicklist(serverDb *db, dictEntry *kde) { +static void defragQuicklist(dictEntry *kde) { robj *ob = dictGetVal(kde); quicklist *ql = ob->ptr, *newql; serverAssert(ob->type == OBJ_LIST && ob->encoding == OBJ_ENCODING_QUICKLIST); if ((newql = activeDefragAlloc(ql))) ob->ptr = ql = newql; if (ql->len > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragQuickListNodes(ql); } -void defragZsetSkiplist(serverDb *db, dictEntry *kde) { +static void defragZsetSkiplist(dictEntry *kde) { robj *ob = dictGetVal(kde); zset *zs = (zset *)ob->ptr; zset *newzs; @@ -481,7 +561,7 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newzsl = activeDefragAlloc(zs->zsl))) zs->zsl = newzsl; if ((newheader = activeDefragAlloc(zs->zsl->header))) zs->zsl->header = newheader; if (dictSize(zs->dict) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + 
defragLater(kde); else { dictIterator *di = dictGetIterator(zs->dict); while ((de = dictNext(di)) != NULL) { @@ -493,26 +573,26 @@ void defragZsetSkiplist(serverDb *db, dictEntry *kde) { if ((newdict = dictDefragTables(zs->dict))) zs->dict = newdict; } -void defragHash(serverDb *db, dictEntry *kde) { +static void defragHash(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_HASH && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_VAL_IS_SDS); /* defrag the dict struct and tables */ if ((newd = dictDefragTables(ob->ptr))) ob->ptr = newd; } -void defragSet(serverDb *db, dictEntry *kde) { +static void defragSet(dictEntry *kde) { robj *ob = dictGetVal(kde); dict *d, *newd; serverAssert(ob->type == OBJ_SET && ob->encoding == OBJ_ENCODING_HT); d = ob->ptr; if (dictSize(d) > server.active_defrag_max_scan_fields) - defragLater(db, kde); + defragLater(kde); else activeDefragSdsDict(d, DEFRAG_SDS_DICT_NO_VAL); /* defrag the dict struct and tables */ @@ -521,7 +601,7 @@ void defragSet(serverDb *db, dictEntry *kde) { /* Defrag callback for radix tree iterator, called for each node, * used in order to defrag the nodes allocations. */ -int defragRaxNode(raxNode **noderef) { +static int defragRaxNode(raxNode **noderef) { raxNode *newnode = activeDefragAlloc(*noderef); if (newnode) { *noderef = newnode; @@ -531,7 +611,7 @@ int defragRaxNode(raxNode **noderef) { } /* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) { +static int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, monotime endtime) { static unsigned char last[sizeof(streamID)]; raxIterator ri; long iterations = 0; @@ -567,7 +647,7 @@ int scanLaterStreamListpacks(robj *ob, unsigned long *cursor, long long endtime) if (newdata) raxSetData(ri.node, ri.data = newdata); server.stat_active_defrag_scanned++; if (++iterations > 128) { - if (ustime() > endtime) { + if (getMonotonicUs() > endtime) { serverAssert(ri.key_len == sizeof(last)); memcpy(last, ri.key, ri.key_len); raxStop(&ri); @@ -589,7 +669,7 @@ typedef void *(raxDefragFunction)(raxIterator *ri, void *privdata); * 2) rax nodes * 3) rax entry data (only if defrag_data is specified) * 4) call a callback per element, and allow the callback to return a new pointer for the element */ -void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { +static void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_cb, void *element_cb_data) { raxIterator ri; rax *rax; if ((rax = activeDefragAlloc(*raxref))) *raxref = rax; @@ -612,7 +692,7 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; streamNACK *nack = ri->data, *newnack; nack->consumer = ctx->c; /* update nack pointer to consumer */ @@ -626,7 +706,7 @@ void *defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { return newnack; } -void *defragStreamConsumer(raxIterator *ri, void *privdata) { +static void *defragStreamConsumer(raxIterator *ri, void *privdata) { streamConsumer *c = ri->data; streamCG *cg = privdata; void *newc = activeDefragAlloc(c); @@ -642,7 +722,7 @@ void *defragStreamConsumer(raxIterator *ri, void *privdata) { return newc; 
/* returns NULL if c was not defragged */ } -void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { +static void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { streamCG *cg = ri->data; UNUSED(privdata); if (cg->consumers) defragRadixTree(&cg->consumers, 0, defragStreamConsumer, cg); @@ -650,7 +730,7 @@ void *defragStreamConsumerGroup(raxIterator *ri, void *privdata) { return NULL; } -void defragStream(serverDb *db, dictEntry *kde) { +static void defragStream(dictEntry *kde) { robj *ob = dictGetVal(kde); serverAssert(ob->type == OBJ_STREAM && ob->encoding == OBJ_ENCODING_STREAM); stream *s = ob->ptr, *news; @@ -661,7 +741,7 @@ void defragStream(serverDb *db, dictEntry *kde) { if (raxSize(s->rax) > server.active_defrag_max_scan_fields) { rax *newrax = activeDefragAlloc(s->rax); if (newrax) s->rax = newrax; - defragLater(db, kde); + defragLater(kde); } else defragRadixTree(&s->rax, 1, NULL, NULL); @@ -671,25 +751,25 @@ void defragStream(serverDb *db, dictEntry *kde) { /* Defrag a module key. This is either done immediately or scheduled * for later. Returns then number of pointers defragged. */ -void defragModule(serverDb *db, dictEntry *kde) { +static void defragModule(serverDb *db, dictEntry *kde) { robj *obj = dictGetVal(kde); serverAssert(obj->type == OBJ_MODULE); - if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(db, kde); + if (!moduleDefragValue(dictGetKey(kde), obj, db->id)) defragLater(kde); } /* for each key we scan in the main dict, this function will attempt to defrag * all the various pointers it has. */ -void defragKey(defragCtx *ctx, dictEntry *de) { - serverDb *db = ctx->privdata; - int slot = ctx->slot; +static void defragKey(defragKeysCtx *ctx, dictEntry *de) { + serverDb *db = ctx->db; + int slot = ctx->kvstate.slot; robj *newob, *ob; unsigned char *newzl; /* Try to defrag robj and / or string value. 
*/ ob = dictGetVal(de); if ((newob = activeDefragStringOb(ob))) { - kvstoreDictSetVal(db->keys, slot, de, newob); + kvstoreDictSetVal(ctx->kvstate.kvs, slot, de, newob); ob = newob; } @@ -697,7 +777,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { /* Already handled in activeDefragStringOb. */ } else if (ob->type == OBJ_LIST) { if (ob->encoding == OBJ_ENCODING_QUICKLIST) { - defragQuicklist(db, de); + defragQuicklist(de); } else if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else { @@ -705,7 +785,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } } else if (ob->type == OBJ_SET) { if (ob->encoding == OBJ_ENCODING_HT) { - defragSet(db, de); + defragSet(de); } else if (ob->encoding == OBJ_ENCODING_INTSET || ob->encoding == OBJ_ENCODING_LISTPACK) { void *newptr, *ptr = ob->ptr; if ((newptr = activeDefragAlloc(ptr))) ob->ptr = newptr; @@ -716,7 +796,7 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_SKIPLIST) { - defragZsetSkiplist(db, de); + defragZsetSkiplist(de); } else { serverPanic("Unknown sorted set encoding"); } @@ -724,12 +804,12 @@ void defragKey(defragCtx *ctx, dictEntry *de) { if (ob->encoding == OBJ_ENCODING_LISTPACK) { if ((newzl = activeDefragAlloc(ob->ptr))) ob->ptr = newzl; } else if (ob->encoding == OBJ_ENCODING_HT) { - defragHash(db, de); + defragHash(de); } else { serverPanic("Unknown hash encoding"); } } else if (ob->type == OBJ_STREAM) { - defragStream(db, de); + defragStream(de); } else if (ob->type == OBJ_MODULE) { defragModule(db, de); } else { @@ -738,9 +818,9 @@ void defragKey(defragCtx *ctx, dictEntry *de) { } /* Defrag scan callback for the main db dictionary. 
*/ -void defragScanCallback(void *privdata, const dictEntry *de) { +static void dbKeysScanCallback(void *privdata, const dictEntry *de) { long long hits_before = server.stat_active_defrag_hits; - defragKey((defragCtx *)privdata, (dictEntry *)de); + defragKey((defragKeysCtx *)privdata, (dictEntry *)de); if (server.stat_active_defrag_hits != hits_before) server.stat_active_defrag_key_hits++; else @@ -754,10 +834,10 @@ void defragScanCallback(void *privdata, const dictEntry *de) { * fragmentation ratio in order to decide if a defrag action should be taken * or not, a false detection can cause the defragmenter to waste a lot of CPU * without the possibility of getting any results. */ -float getAllocatorFragmentation(size_t *out_frag_bytes) { +static float getAllocatorFragmentation(size_t *out_frag_bytes) { size_t resident, active, allocated, frag_smallbins_bytes; - zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL, &frag_smallbins_bytes); - + zmalloc_get_allocator_info(&allocated, &active, &resident, NULL, NULL); + frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); /* Calculate the fragmentation ratio as the proportion of wasted memory in small * bins (which are defraggable) relative to the total allocated memory (including large bins). * This is because otherwise, if most of the memory usage is large bins, we may show high percentage, @@ -772,18 +852,18 @@ float getAllocatorFragmentation(size_t *out_frag_bytes) { } /* Defrag scan callback for the pubsub dictionary. 
*/ -void defragPubsubScanCallback(void *privdata, const dictEntry *de) { - defragCtx *ctx = privdata; - defragPubSubCtx *pubsub_ctx = ctx->privdata; - kvstore *pubsub_channels = pubsub_ctx->pubsub_channels; +static void defragPubsubScanCallback(void *privdata, const dictEntry *de) { + defragPubSubCtx *ctx = privdata; + kvstore *pubsub_channels = ctx->kvstate.kvs; robj *newchannel, *channel = dictGetKey(de); dict *newclients, *clients = dictGetVal(de); + size_t allocation_size; /* Try to defrag the channel name. */ serverAssert(channel->refcount == (int)dictSize(clients) + 1); - newchannel = activeDefragStringObEx(channel, dictSize(clients) + 1); + newchannel = activeDefragStringObWithoutFree(channel, &allocation_size); if (newchannel) { - kvstoreDictSetKey(pubsub_channels, ctx->slot, (dictEntry *)de, newchannel); + kvstoreDictSetKey(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newchannel); /* The channel name is shared by the client's pubsub(shard) and server's * pubsub(shard), after defraging the channel name, we need to update @@ -792,35 +872,26 @@ void defragPubsubScanCallback(void *privdata, const dictEntry *de) { dictEntry *clientde; while ((clientde = dictNext(di)) != NULL) { client *c = dictGetKey(clientde); - dictEntry *pubsub_channel = dictFind(pubsub_ctx->clientPubSubChannels(c), newchannel); + dict *client_channels = ctx->getPubSubChannels(c); + dictEntry *pubsub_channel = dictFind(client_channels, newchannel); serverAssert(pubsub_channel); - dictSetKey(pubsub_ctx->clientPubSubChannels(c), pubsub_channel, newchannel); + dictSetKey(ctx->getPubSubChannels(c), pubsub_channel, newchannel); } dictReleaseIterator(di); + // Now that we're done correcting the references, we can safely free the old channel robj + allocatorDefragFree(channel, allocation_size); } /* Try to defrag the dictionary of clients that is stored as the value part. 
*/ if ((newclients = dictDefragTables(clients))) - kvstoreDictSetVal(pubsub_channels, ctx->slot, (dictEntry *)de, newclients); + kvstoreDictSetVal(pubsub_channels, ctx->kvstate.slot, (dictEntry *)de, newclients); server.stat_active_defrag_scanned++; } -/* We may need to defrag other globals, one small allocation can hold a full allocator run. - * so although small, it is still important to defrag these */ -void defragOtherGlobals(void) { - /* there are many more pointers to defrag (e.g. client argv, output / aof buffers, etc. - * but we assume most of these are short lived, we only need to defrag allocations - * that remain static for a long time */ - activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); - moduleDefragGlobals(); - kvstoreDictLUTDefrag(server.pubsub_channels, dictDefragTables); - kvstoreDictLUTDefrag(server.pubsubshard_channels, dictDefragTables); -} - /* returns 0 more work may or may not be needed (see non-zero cursor), * and 1 if time is up and more work is needed. */ -int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int dbid) { +static int defragLaterItem(dictEntry *de, unsigned long *cursor, monotime endtime, int dbid) { if (de) { robj *ob = dictGetVal(de); if (ob->type == OBJ_LIST) { @@ -844,299 +915,474 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime, int return 0; } -/* static variables serving defragLaterStep to continue scanning a key from were we stopped last time. */ -static sds defrag_later_current_key = NULL; -static unsigned long defrag_later_cursor = 0; -/* returns 0 if no more work needs to be been done, and 1 if time is up and more work is needed. 
*/ -int defragLaterStep(serverDb *db, int slot, long long endtime) { +// A kvstoreHelperPreContinueFn +static doneStatus defragLaterStep(monotime endtime, void *privdata) { + defragKeysCtx *ctx = privdata; + unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long key_defragged; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_later_cursor) { - listNode *head = listFirst(db->defrag_later); - - /* Move on to next key */ - if (defrag_later_current_key) { - serverAssert(defrag_later_current_key == head->value); - listDelNode(db->defrag_later, head); - defrag_later_cursor = 0; - defrag_later_current_key = NULL; - } + while (defrag_later && listLength(defrag_later) > 0) { + listNode *head = listFirst(defrag_later); + sds key = head->value; + dictEntry *de = kvstoreDictFind(ctx->kvstate.kvs, ctx->kvstate.slot, key); - /* stop if we reached the last one. */ - head = listFirst(db->defrag_later); - if (!head) return 0; - - /* start a new key */ - defrag_later_current_key = head->value; - defrag_later_cursor = 0; - } - - /* each time we enter this function we need to fetch the key from the dict again (if it still exists) */ - dictEntry *de = kvstoreDictFind(db->keys, slot, defrag_later_current_key); - key_defragged = server.stat_active_defrag_hits; - do { - int quit = 0; - if (defragLaterItem(de, &defrag_later_cursor, endtime, db->id)) - quit = 1; /* time is up, we didn't finish all the work */ - - /* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields - * (if we have a lot of pointers in one hash bucket, or rehashing), - * check if we reached the time limit. 
*/ - if (quit || (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64)) { - if (quit || ustime() > endtime) { - if (key_defragged != server.stat_active_defrag_hits) - server.stat_active_defrag_key_hits++; - else - server.stat_active_defrag_key_misses++; - return 1; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (defrag_later_cursor); - if (key_defragged != server.stat_active_defrag_hits) + long long key_defragged = server.stat_active_defrag_hits; + bool timeout = (defragLaterItem(de, &defrag_later_cursor, endtime, ctx->db->id) == 1); + if (key_defragged != server.stat_active_defrag_hits) { server.stat_active_defrag_key_hits++; - else + } else { server.stat_active_defrag_key_misses++; - } while (1); -} + } -#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) -#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? max : (y))) + if (timeout) break; -/* decide if defrag is needed, and at what CPU effort to invest in it */ -void computeDefragCycles(void) { - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - /* If we're not already running, and below the threshold, exit. 
*/ - if (!server.active_defrag_running) { - if (frag_pct < server.active_defrag_threshold_lower || frag_bytes < server.active_defrag_ignore_bytes) return; + if (defrag_later_cursor == 0) { + // the item is finished, move on + listDelNode(defrag_later, head); + } + + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || + server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() > endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; + } } - /* Calculate the adaptive aggressiveness of the defrag based on the current - * fragmentation and configurations. */ - int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, - server.active_defrag_cycle_min, server.active_defrag_cycle_max); - cpu_pct = LIMIT(cpu_pct, server.active_defrag_cycle_min, server.active_defrag_cycle_max); + return (!defrag_later || listLength(defrag_later) == 0) ? DEFRAG_DONE : DEFRAG_NOT_DONE; +} - /* Normally we allow increasing the aggressiveness during a scan, but don't - * reduce it, since we should not lower the aggressiveness when fragmentation - * drops. But when a configuration is made, we should reconsider it. */ - if (cpu_pct > server.active_defrag_running || server.active_defrag_configuration_changed) { - server.active_defrag_running = cpu_pct; - server.active_defrag_configuration_changed = 0; - serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", frag_pct, frag_bytes, - cpu_pct); + +/* This helper function handles most of the work for iterating over a kvstore. 'privdata', if + * provided, MUST begin with 'kvstoreIterState' and this part is automatically updated by this + * function during the iteration. 
*/ +static doneStatus defragStageKvstoreHelper(monotime endtime, + kvstore *kvs, + dictScanFunction scan_fn, + kvstoreHelperPreContinueFn precontinue_fn, + const dictDefragFunctions *defragfns, + void *privdata) { + static kvstoreIterState state; // STATIC - this persists + if (endtime == 0) { + // Starting the stage, set up the state information for this stage + state.kvs = kvs; + state.slot = KVS_SLOT_DEFRAG_LUT; + state.cursor = 0; + return DEFRAG_NOT_DONE; } -} + serverAssert(kvs == state.kvs); // Shouldn't change during the stage -/* Perform incremental defragmentation work from the serverCron. - * This works in a similar way to activeExpireCycle, in the sense that - * we do incremental work across calls. */ -void activeDefragCycle(void) { - static int slot = -1; - static int current_db = -1; - static int defrag_later_item_in_progress = 0; - static int defrag_stage = 0; - static unsigned long defrag_cursor = 0; - static serverDb *db = NULL; - static long long start_scan, start_stat; unsigned int iterations = 0; unsigned long long prev_defragged = server.stat_active_defrag_hits; unsigned long long prev_scanned = server.stat_active_defrag_scanned; - long long start, timelimit, endtime; - mstime_t latency; - int all_stages_finished = 0; - int quit = 0; - if (!server.active_defrag_enabled) { - if (server.active_defrag_running) { - /* if active defrag was disabled mid-run, start from fresh next time. 
*/ - server.active_defrag_running = 0; - server.active_defrag_configuration_changed = 0; - if (db) listEmpty(db->defrag_later); - defrag_later_current_key = NULL; - defrag_later_cursor = 0; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - goto update_metrics; + if (state.slot == KVS_SLOT_DEFRAG_LUT) { + // Before we start scanning the kvstore, handle the main structures + do { + state.cursor = kvstoreDictLUTDefrag(kvs, state.cursor, dictDefragTables); + if (getMonotonicUs() >= endtime) return DEFRAG_NOT_DONE; + } while (state.cursor != 0); + state.slot = KVS_SLOT_UNASSIGNED; + } + + while (true) { + if (++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || server.stat_active_defrag_scanned - prev_scanned > 64) { + if (getMonotonicUs() >= endtime) break; + iterations = 0; + prev_defragged = server.stat_active_defrag_hits; + prev_scanned = server.stat_active_defrag_scanned; } - return; + + if (precontinue_fn) { + if (privdata) *(kvstoreIterState *)privdata = state; + if (precontinue_fn(endtime, privdata) == DEFRAG_NOT_DONE) return DEFRAG_NOT_DONE; + } + + if (!state.cursor) { + // If there's no cursor, we're ready to begin a new kvstore slot. + if (state.slot == KVS_SLOT_UNASSIGNED) { + state.slot = kvstoreGetFirstNonEmptyDictIndex(kvs); + } else { + state.slot = kvstoreGetNextNonEmptyDictIndex(kvs, state.slot); + } + + if (state.slot == KVS_SLOT_UNASSIGNED) return DEFRAG_DONE; + } + + // Whatever privdata's actual type, this function requires that it begins with kvstoreIterState. + if (privdata) *(kvstoreIterState *)privdata = state; + state.cursor = kvstoreDictScanDefrag(kvs, state.slot, state.cursor, + scan_fn, defragfns, privdata); } - if (hasActiveChildProcess()) return; /* Defragging memory while there's a fork will just do damage. 
*/ + return DEFRAG_NOT_DONE; +} + - /* Once a second, check if the fragmentation justfies starting a scan - * or making it more aggressive. */ - run_with_period(1000) { - computeDefragCycles(); +// Note: target is a DB, (not a KVS like most stages) +static doneStatus defragStageDbKeys(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + serverDb *db = (serverDb *)target; + + static defragKeysCtx ctx; // STATIC - this persists + if (endtime == 0) { + ctx.db = db; + // Don't return yet. Call the helper with endtime==0 below. } + serverAssert(ctx.db == db); - /* Normally it is checked once a second, but when there is a configuration - * change, we want to check it as soon as possible. */ - if (server.active_defrag_configuration_changed) { - computeDefragCycles(); - server.active_defrag_configuration_changed = 0; + /* Note: for DB keys, we use the start/finish callback to fix an expires table entry if + * the main DB entry has been moved. */ + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by dbKeysScanCallback + .defragVal = NULL, // Handled by dbKeysScanCallback + .defragEntryStartCb = defragEntryStartCbForKeys, + .defragEntryFinishCb = defragEntryFinishCbForKeys}; + + return defragStageKvstoreHelper(endtime, db->keys, + dbKeysScanCallback, defragLaterStep, &defragfns, &ctx); +} + + +static doneStatus defragStageExpiresKvstore(monotime endtime, void *target, void *privdata) { + UNUSED(privdata); + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Not needed for expires (just a ref) + .defragVal = NULL, // Not needed for expires (no value) + }; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + scanCallbackCountScanned, NULL, &defragfns, NULL); +} + + +static doneStatus defragStagePubsubKvstore(monotime endtime, void *target, void *privdata) { + // target is server.pubsub_channels or server.pubsubshard_channels + 
getClientChannelsFnWrapper *fnWrapper = privdata; + + static const dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = NULL, // Handled by defragPubsubScanCallback + .defragVal = NULL, // Not needed for expires (no value) + }; + defragPubSubCtx ctx; + + ctx.getPubSubChannels = fnWrapper->fn; + return defragStageKvstoreHelper(endtime, (kvstore *)target, + defragPubsubScanCallback, NULL, &defragfns, &ctx); +} + + +static doneStatus defragLuaScripts(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + activeDefragSdsDict(evalScriptsDict(), DEFRAG_SDS_DICT_VAL_LUA_SCRIPT); + return DEFRAG_DONE; +} + + +static doneStatus defragModuleGlobals(monotime endtime, void *target, void *privdata) { + UNUSED(target); + UNUSED(privdata); + if (endtime == 0) return DEFRAG_NOT_DONE; // required initialization + moduleDefragGlobals(); + return DEFRAG_DONE; +} + + +static bool defragIsRunning(void) { + return (defrag.timeproc_id > 0); +} + + +static void addDefragStage(defragStageFn stage_fn, void *target, void *privdata) { + StageDescriptor *stage = zmalloc(sizeof(StageDescriptor)); + stage->stage_fn = stage_fn; + stage->target = target; + stage->privdata = privdata; + listAddNodeTail(defrag.remaining_stages, stage); +} + + +// Called at the end of a complete defrag cycle, or when defrag is terminated +static void endDefragCycle(bool normal_termination) { + if (normal_termination) { + // For normal termination, we expect... 
+ serverAssert(!defrag.current_stage); + serverAssert(listLength(defrag.remaining_stages) == 0); + serverAssert(!defrag_later || listLength(defrag_later) == 0); + } else { + // Defrag is being terminated abnormally + aeDeleteTimeEvent(server.el, defrag.timeproc_id); + + if (defrag.current_stage) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } + listSetFreeMethod(defrag.remaining_stages, zfree); } + defrag.timeproc_id = AE_DELETED_EVENT_ID; - if (!server.active_defrag_running) return; + listRelease(defrag.remaining_stages); + defrag.remaining_stages = NULL; - /* See activeExpireCycle for how timelimit is handled. */ - start = ustime(); - timelimit = 1000000 * server.active_defrag_running / server.hz / 100; - if (timelimit <= 0) timelimit = 1; - endtime = start + timelimit; - latencyStartMonitor(latency); + if (defrag_later) { + listRelease(defrag_later); + defrag_later = NULL; + } + defrag_later_cursor = 0; - dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc, - .defragEntryStartCb = defragEntryStartCbForKeys, - .defragEntryFinishCb = defragEntryFinishCbForKeys}; - do { - /* if we're not continuing a scan from the last call or loop, start a new one */ - if (!defrag_stage && !defrag_cursor && (slot < 0)) { - /* finish any leftovers from previous db before moving to the next one */ - if (db && defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", + (int)elapsedMs(defrag.start_cycle), (int)(server.stat_active_defrag_hits - defrag.start_defrag_hits), + frag_pct, frag_bytes); - /* Move on to next database, and stop if we reached the last one. 
*/ - if (++current_db >= server.dbnum) { - /* defrag other items not part of the db / keys */ - defragOtherGlobals(); - - long long now = ustime(); - size_t frag_bytes; - float frag_pct = getAllocatorFragmentation(&frag_bytes); - serverLog(LL_VERBOSE, "Active defrag done in %dms, reallocated=%d, frag=%.0f%%, frag_bytes=%zu", - (int)((now - start_scan) / 1000), (int)(server.stat_active_defrag_hits - start_stat), - frag_pct, frag_bytes); - - start_scan = now; - current_db = -1; - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; - db = NULL; - server.active_defrag_running = 0; - - computeDefragCycles(); /* if another scan is needed, start it right away */ - if (server.active_defrag_running != 0 && ustime() < endtime) continue; - break; - } else if (current_db == 0) { - /* Start a scan from the first database. */ - start_scan = ustime(); - start_stat = server.stat_active_defrag_hits; - } + server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); + server.stat_last_active_defrag_time = 0; + server.active_defrag_cpu_percent = 0; +} + + +/* Must be called at the start of the timeProc as it measures the delay from the end of the previous + * timeProc invocation when performing the computation. */ +static int computeDefragCycleUs(void) { + long dutyCycleUs; - db = &server.db[current_db]; - kvstoreDictLUTDefrag(db->keys, dictDefragTables); - kvstoreDictLUTDefrag(db->expires, dictDefragTables); - defrag_stage = 0; - defrag_cursor = 0; - slot = -1; - defrag_later_item_in_progress = 0; + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + static int prevCpuPercent = 0; // STATIC - this persists + if (targetCpuPercent != prevCpuPercent) { + /* If the targetCpuPercent changes, the value might be different from when the last wait + * time was computed. In this case, don't consider wait time. 
(This is really only an + * issue in crazy tests that dramatically increase CPU while defrag is running.) */ + defrag.timeproc_end_time = 0; + prevCpuPercent = targetCpuPercent; + } + + // Given when the last duty cycle ended, compute time needed to achieve the desired percentage. + if (defrag.timeproc_end_time == 0) { + // Either the first call to the timeProc, or we were paused for some reason. + defrag.timeproc_overage_us = 0; + dutyCycleUs = server.active_defrag_cycle_us; + } else { + long waitedUs = getMonotonicUs() - defrag.timeproc_end_time; + /* Given the elapsed wait time between calls, compute the necessary duty time needed to + * achieve the desired CPU percentage. + * With: D = duty time, W = wait time, P = percent + * Solve: D P + * ----- = ----- + * D + W 100 + * Solving for D: + * D = P * W / (100 - P) + * + * Note that dutyCycleUs addresses starvation. If the wait time was long, we will compensate + * with a proportionately long duty-cycle. This won't significantly affect perceived + * latency, because clients are already being impacted by the long cycle time which caused + * the starvation of the timer. */ + dutyCycleUs = targetCpuPercent * waitedUs / (100 - targetCpuPercent); + + // Also adjust for any accumulated overage(underage). + dutyCycleUs -= defrag.timeproc_overage_us; + defrag.timeproc_overage_us = 0; + + if (dutyCycleUs < server.active_defrag_cycle_us) { + /* We never reduce our cycle time, that would increase overhead. Instead, we track this + * as part of the overage, and increase wait time between cycles. */ + defrag.timeproc_overage_us = server.active_defrag_cycle_us - dutyCycleUs; + dutyCycleUs = server.active_defrag_cycle_us; } + } + return dutyCycleUs; +} - /* This array of structures holds the parameters for all defragmentation stages. 
*/ - typedef struct defragStage { - kvstore *kvs; - dictScanFunction *scanfn; - void *privdata; - } defragStage; - defragStage defrag_stages[] = { - {db->keys, defragScanCallback, db}, - {db->expires, scanCallbackCountScanned, NULL}, - {server.pubsub_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsub_channels, getClientPubSubChannels}}, - {server.pubsubshard_channels, defragPubsubScanCallback, - &(defragPubSubCtx){server.pubsubshard_channels, getClientPubSubShardChannels}}, - }; - do { - int num_stages = sizeof(defrag_stages) / sizeof(defrag_stages[0]); - serverAssert(defrag_stage < num_stages); - defragStage *current_stage = &defrag_stages[defrag_stage]; - - /* before scanning the next bucket, see if we have big keys left from the previous bucket to scan */ - if (defragLaterStep(db, slot, endtime)) { - quit = 1; /* time is up, we didn't finish all the work */ - break; /* this will exit the function and we'll continue on the next cycle */ - } - if (!defrag_later_item_in_progress) { - /* Continue defragmentation from the previous stage. - * If slot is -1, it means this stage starts from the first non-empty slot. */ - if (slot == -1) slot = kvstoreGetFirstNonEmptyDictIndex(current_stage->kvs); - defrag_cursor = kvstoreDictScanDefrag(current_stage->kvs, slot, defrag_cursor, current_stage->scanfn, - &defragfns, &(defragCtx){current_stage->privdata, slot}); - } +/* Must be called at the end of the timeProc as it records the timeproc_end_time for use in the next + * computeDefragCycleUs computation. */ +static int computeDelayMs(monotime intendedEndtime) { + defrag.timeproc_end_time = getMonotonicUs(); + int overage = defrag.timeproc_end_time - intendedEndtime; + defrag.timeproc_overage_us += overage; // track over/under desired CPU + + int targetCpuPercent = server.active_defrag_cpu_percent; + serverAssert(targetCpuPercent > 0 && targetCpuPercent < 100); + + // Given the desired duty cycle, what inter-cycle delay do we need to achieve that? 
+ // We want to achieve a specific CPU percent. To do that, we can't use a skewed computation. + // Example, if we run for 1ms and delay 10ms, that's NOT 10%, because the total cycle time is 11ms. + // Instead, if we run for 1ms, our total time should be 10ms. So the delay is only 9ms. + long totalCycleTimeUs = server.active_defrag_cycle_us * 100 / targetCpuPercent; + long delayUs = totalCycleTimeUs - server.active_defrag_cycle_us; + // Only increase delay by the fraction of the overage that would be non-duty-cycle + delayUs += defrag.timeproc_overage_us * (100 - targetCpuPercent) / 100; // "overage" might be negative + if (delayUs < 0) delayUs = 0; + long delayMs = delayUs / 1000; // round down + return delayMs; +} - if (!defrag_cursor) { - /* Move to the next slot only if regular and large item scanning has been completed. */ - if (listLength(db->defrag_later) > 0) { - defrag_later_item_in_progress = 1; - continue; - } - /* Move to the next slot in the current stage. If we've reached the end, move to the next stage. */ - if ((slot = kvstoreGetNextNonEmptyDictIndex(current_stage->kvs, slot)) == -1) defrag_stage++; - defrag_later_item_in_progress = 0; - } +/* An independent time proc for defrag. While defrag is running, this is called much more often + * than the server cron. Frequent short calls provide low latency impact. */ +static long long activeDefragTimeProc(struct aeEventLoop *eventLoop, long long id, void *clientData) { + UNUSED(eventLoop); + UNUSED(id); + UNUSED(clientData); - /* Check if all defragmentation stages have been processed. - * If so, mark as finished and reset the stage counter to move on to next database. */ - if (defrag_stage == num_stages) { - all_stages_finished = 1; - defrag_stage = 0; - } + // This timer shouldn't be registered unless there's work to do. + serverAssert(defrag.current_stage || listLength(defrag.remaining_stages) > 0); - /* Once in 16 scan iterations, 512 pointer reallocations. 
or 64 keys - * (if we have a lot of pointers in one hash bucket or rehashing), - * check if we reached the time limit. - * But regardless, don't start a new db in this loop, this is because after - * the last db we call defragOtherGlobals, which must be done in one cycle */ - if (all_stages_finished || ++iterations > 16 || server.stat_active_defrag_hits - prev_defragged > 512 || - server.stat_active_defrag_scanned - prev_scanned > 64) { - /* Quit if all stages were finished or timeout. */ - if (all_stages_finished || ustime() > endtime) { - quit = 1; - break; - } - iterations = 0; - prev_defragged = server.stat_active_defrag_hits; - prev_scanned = server.stat_active_defrag_scanned; - } - } while (!all_stages_finished && !quit); - } while (!quit); + if (!server.active_defrag_enabled) { + // Defrag has been disabled while running + endDefragCycle(false); + return AE_NOMORE; + } + + if (hasActiveChildProcess()) { + // If there's a child process, pause the defrag, polling until the child completes. 
+ defrag.timeproc_end_time = 0; // prevent starvation recovery + return 100; + } + + monotime starttime = getMonotonicUs(); + monotime endtime = starttime + computeDefragCycleUs(); + + mstime_t latency; + latencyStartMonitor(latency); + + if (!defrag.current_stage) { + defrag.current_stage = listNodeValue(listFirst(defrag.remaining_stages)); + listDelNode(defrag.remaining_stages, listFirst(defrag.remaining_stages)); + // Initialize the stage with endtime==0 + doneStatus status = defrag.current_stage->stage_fn(0, defrag.current_stage->target, defrag.current_stage->privdata); + serverAssert(status == DEFRAG_NOT_DONE); // Initialization should always return DEFRAG_NOT_DONE + } + + doneStatus status = defrag.current_stage->stage_fn(endtime, defrag.current_stage->target, defrag.current_stage->privdata); + if (status == DEFRAG_DONE) { + zfree(defrag.current_stage); + defrag.current_stage = NULL; + } latencyEndMonitor(latency); latencyAddSampleIfNeeded("active-defrag-cycle", latency); -update_metrics: - if (server.active_defrag_running > 0) { - if (server.stat_last_active_defrag_time == 0) elapsedStart(&server.stat_last_active_defrag_time); - } else if (server.stat_last_active_defrag_time != 0) { - server.stat_total_active_defrag_time += elapsedUs(server.stat_last_active_defrag_time); - server.stat_last_active_defrag_time = 0; + if (defrag.current_stage || listLength(defrag.remaining_stages) > 0) { + return computeDelayMs(endtime); + } else { + endDefragCycle(true); + return AE_NOMORE; // Ends the timer proc + } +} + + +/* During long running scripts, or while loading, there is a periodic function for handling other + * actions. This interface allows defrag to continue running, avoiding a single long defrag step + * after the long operation completes. */ +void defragWhileBlocked(void) { + if (!defragIsRunning()) return; + + // Save off the timeproc_id. If we have a normal termination, it will be cleared. 
+ long long timeproc_id = defrag.timeproc_id; + + // Simulate a single call of the timer proc + long long reschedule_delay = activeDefragTimeProc(NULL, 0, NULL); + if (reschedule_delay == AE_NOMORE) { + // If it's done, deregister the timer + aeDeleteTimeEvent(server.el, timeproc_id); } + /* Otherwise, just ignore the reschedule_delay, the timer will pop the next time that the + * event loop can process timers again. */ +} + + +static void beginDefragCycle(void) { + serverAssert(!defragIsRunning()); + + serverAssert(defrag.remaining_stages == NULL); + defrag.remaining_stages = listCreate(); + + for (int dbid = 0; dbid < server.dbnum; dbid++) { + serverDb *db = &server.db[dbid]; + addDefragStage(defragStageDbKeys, db, NULL); + addDefragStage(defragStageExpiresKvstore, db->expires, NULL); + } + + static getClientChannelsFnWrapper getClientPubSubChannelsFn = {getClientPubSubChannels}; + static getClientChannelsFnWrapper getClientPubSubShardChannelsFn = {getClientPubSubShardChannels}; + addDefragStage(defragStagePubsubKvstore, server.pubsub_channels, &getClientPubSubChannelsFn); + addDefragStage(defragStagePubsubKvstore, server.pubsubshard_channels, &getClientPubSubShardChannelsFn); + + addDefragStage(defragLuaScripts, NULL, NULL); + addDefragStage(defragModuleGlobals, NULL, NULL); + + defrag.current_stage = NULL; + defrag.start_cycle = getMonotonicUs(); + defrag.start_defrag_hits = server.stat_active_defrag_hits; + defrag.timeproc_end_time = 0; + defrag.timeproc_overage_us = 0; + defrag.timeproc_id = aeCreateTimeEvent(server.el, 0, activeDefragTimeProc, NULL, NULL); + + elapsedStart(&server.stat_last_active_defrag_time); +} + + +#define INTERPOLATE(x, x1, x2, y1, y2) ((y1) + ((x) - (x1)) * ((y2) - (y1)) / ((x2) - (x1))) +#define LIMIT(y, min, max) ((y) < (min) ? min : ((y) > (max) ? 
max : (y))) + +/* decide if defrag is needed, and at what CPU effort to invest in it */ +static void updateDefragCpuPercent(void) { + size_t frag_bytes; + float frag_pct = getAllocatorFragmentation(&frag_bytes); + if (server.active_defrag_cpu_percent == 0) { + if (frag_pct < server.active_defrag_threshold_lower || + frag_bytes < server.active_defrag_ignore_bytes) return; + } + + /* Calculate the adaptive aggressiveness of the defrag based on the current + * fragmentation and configurations. */ + int cpu_pct = INTERPOLATE(frag_pct, server.active_defrag_threshold_lower, server.active_defrag_threshold_upper, + server.active_defrag_cpu_min, server.active_defrag_cpu_max); + cpu_pct = LIMIT(cpu_pct, server.active_defrag_cpu_min, server.active_defrag_cpu_max); + + /* Normally we allow increasing the aggressiveness during a scan, but don't + * reduce it, since we should not lower the aggressiveness when fragmentation + * drops. But when a configuration is made, we should reconsider it. */ + if (cpu_pct > server.active_defrag_cpu_percent || server.active_defrag_configuration_changed) { + server.active_defrag_configuration_changed = 0; + if (defragIsRunning()) { + serverLog(LL_VERBOSE, "Changing active defrag CPU, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } else { + serverLog(LL_VERBOSE, "Starting active defrag, frag=%.0f%%, frag_bytes=%zu, cpu=%d%%", + frag_pct, frag_bytes, cpu_pct); + } + server.active_defrag_cpu_percent = cpu_pct; + } +} + + +void monitorActiveDefrag(void) { + if (!server.active_defrag_enabled) return; + + /* Defrag gets paused while a child process is active. So there's no point in starting a new + * cycle or adjusting the CPU percentage for an existing cycle. 
*/ + if (hasActiveChildProcess()) return; + + updateDefragCpuPercent(); + + if (server.active_defrag_cpu_percent > 0 && !defragIsRunning()) beginDefragCycle(); } #else /* HAVE_DEFRAG */ -void activeDefragCycle(void) { +void monitorActiveDefrag(void) { /* Not implemented yet. */ } @@ -1150,4 +1396,7 @@ robj *activeDefragStringOb(robj *ob) { return NULL; } +void defragWhileBlocked(void) { +} + #endif diff --git a/src/dict.c b/src/dict.c index f164820584..f75369d533 100644 --- a/src/dict.c +++ b/src/dict.c @@ -576,7 +576,7 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) { if (!position) return NULL; /* Dup the key if necessary. */ - if (d->type->keyDup) key = d->type->keyDup(d, key); + if (d->type->keyDup) key = d->type->keyDup(key); return dictInsertAtPosition(d, key, position); } @@ -640,7 +640,7 @@ int dictReplace(dict *d, void *key, void *val) { * reverse. */ void *oldval = dictGetVal(existing); dictSetVal(d, existing, val); - if (d->type->valDestructor) d->type->valDestructor(d, oldval); + if (d->type->valDestructor) d->type->valDestructor(oldval); return 0; } @@ -742,6 +742,18 @@ dictEntry *dictUnlink(dict *d, const void *key) { return dictGenericDelete(d, key, 1); } +inline static void dictFreeKey(dict *d, dictEntry *entry) { + if (d->type->keyDestructor) { + d->type->keyDestructor(dictGetKey(entry)); + } +} + +inline static void dictFreeVal(dict *d, dictEntry *entry) { + if (d->type->valDestructor) { + d->type->valDestructor(dictGetVal(entry)); + } +} + /* You need to call this function to really free the entry after a call * to dictUnlink(). It's safe to call this function with 'he' = NULL. */ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { @@ -919,7 +931,7 @@ void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table : (entryIsEmbedded(de) ? &decodeEntryEmbedded(de)->field : (panic("Entry type not supported"), NULL))) void dictSetKey(dict *d, dictEntry *de, void *key) { - void *k = d->type->keyDup ? 
d->type->keyDup(d, key) : key; + void *k = d->type->keyDup ? d->type->keyDup(key) : key; if (entryIsNormal(de)) { dictEntryNormal *_de = decodeEntryNormal(de); _de->key = k; @@ -1309,7 +1321,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { /* Reallocate the dictEntry, key and value allocations in a bucket using the * provided allocation functions in order to defrag them. */ -static void dictDefragBucket(dictEntry **bucketref, dictDefragFunctions *defragfns, void *privdata) { +static void dictDefragBucket(dictEntry **bucketref, const dictDefragFunctions *defragfns, void *privdata) { dictDefragAllocFunction *defragalloc = defragfns->defragAlloc; dictDefragAllocFunction *defragkey = defragfns->defragKey; dictDefragAllocFunction *defragval = defragfns->defragVal; @@ -1487,7 +1499,7 @@ unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *pri * where NULL means that no reallocation happened and the old memory is still * valid. */ unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata) { +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata) { int htidx0, htidx1; const dictEntry *de, *next; unsigned long m0, m1; diff --git a/src/dict.h b/src/dict.h index 1c9e059baa..854d026cdc 100644 --- a/src/dict.h +++ b/src/dict.h @@ -53,10 +53,10 @@ typedef struct dict dict; typedef struct dictType { /* Callbacks */ uint64_t (*hashFunction)(const void *key); - void *(*keyDup)(dict *d, const void *key); - int (*keyCompare)(dict *d, const void *key1, const void *key2); - void (*keyDestructor)(dict *d, void *key); - void (*valDestructor)(dict *d, void *obj); + void *(*keyDup)(const void *key); + int (*keyCompare)(const void *key1, const void *key2); + void (*keyDestructor)(void *key); + void (*valDestructor)(void *obj); int (*resizeAllowed)(size_t moreMem, double usedRatio); /* Invoked at the start of 
dict initialization/rehashing (old and new ht are already created) */ void (*rehashingStarted)(dict *d); @@ -144,16 +144,13 @@ typedef struct { #define DICT_HT_INITIAL_SIZE (1 << (DICT_HT_INITIAL_EXP)) /* ------------------------------- Macros ------------------------------------*/ -#define dictFreeVal(d, entry) \ - do { \ - if ((d)->type->valDestructor) (d)->type->valDestructor((d), dictGetVal(entry)); \ - } while (0) - -#define dictFreeKey(d, entry) \ - if ((d)->type->keyDestructor) (d)->type->keyDestructor((d), dictGetKey(entry)) - -#define dictCompareKeys(d, key1, key2) \ - (((d)->type->keyCompare) ? (d)->type->keyCompare((d), key1, key2) : (key1) == (key2)) +static inline int dictCompareKeys(dict *d, const void *key1, const void *key2) { + if (d->type->keyCompare) { + return d->type->keyCompare(key1, key2); + } else { + return (key1 == key2); + } +} #define dictMetadata(d) (&(d)->metadata) #define dictMetadataSize(d) ((d)->type->dictMetadataBytes ? (d)->type->dictMetadataBytes(d) : 0) @@ -241,7 +238,7 @@ void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); unsigned long -dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); +dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, const dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); void dictRehashingInfo(dict *d, unsigned long long *from_size, unsigned long long *to_size); diff --git a/src/eval.c b/src/eval.c index fd12e40ad2..a9c50cdf90 100644 --- a/src/eval.c +++ b/src/eval.c @@ -57,8 +57,7 @@ void evalGenericCommandWithDebugging(client *c, int evalsha); sds ldbCatStackValue(sds s, lua_State *lua, int idx); listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha); -static void dictLuaScriptDestructor(dict *d, void *val) { - UNUSED(d); +static void 
dictLuaScriptDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. */ decrRefCount(((luaScript *)val)->body); zfree(val); @@ -200,10 +199,12 @@ void scriptingInit(int setup) { } /* Initialize a dictionary we use to map SHAs to scripts. - * Initialize a list we use for lua script evictions, it shares the - * sha with the dictionary, so free fn is not set. */ + * Initialize a list we use for lua script evictions. + * Note that we duplicate the sha when adding to the lru list due to defrag, + * and we need to free them respectively. */ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); + listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -519,9 +520,6 @@ void luaDeleteFunction(client *c, sds sha) { dictEntry *de = dictUnlink(lctx.lua_scripts, sha); serverAssertWithInfo(c ? c : lctx.lua_client, NULL, de); luaScript *l = dictGetVal(de); - /* We only delete `EVAL` scripts, which must exist in the LRU list. */ - serverAssert(l->node); - listDelNode(lctx.lua_scripts_lru_list, l->node); lctx.lua_scripts_mem -= sdsAllocSize(sha) + getStringObjectSdsUsedMemory(l->body); dictFreeUnlinkedEntry(lctx.lua_scripts, de); } @@ -550,11 +548,12 @@ listNode *luaScriptsLRUAdd(client *c, sds sha, int evalsha) { listNode *ln = listFirst(lctx.lua_scripts_lru_list); sds oldest = listNodeValue(ln); luaDeleteFunction(c, oldest); + listDelNode(lctx.lua_scripts_lru_list, ln); server.stat_evictedscripts++; } /* Add current. 
*/ - listAddNodeTail(lctx.lua_scripts_lru_list, sha); + listAddNodeTail(lctx.lua_scripts_lru_list, sdsdup(sha)); return listLast(lctx.lua_scripts_lru_list); } diff --git a/src/evict.c b/src/evict.c index 5e4b6220eb..5208328b32 100644 --- a/src/evict.c +++ b/src/evict.c @@ -546,8 +546,8 @@ int performEvictions(void) { goto update_metrics; } - if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION) { - result = EVICT_FAIL; /* We need to free memory, but policy forbids. */ + if (server.maxmemory_policy == MAXMEMORY_NO_EVICTION || (iAmPrimary() && server.import_mode)) { + result = EVICT_FAIL; /* We need to free memory, but policy forbids or we are in import mode. */ goto update_metrics; } diff --git a/src/expire.c b/src/expire.c index 928bb58d86..c22df1ef86 100644 --- a/src/expire.c +++ b/src/expire.c @@ -520,8 +520,11 @@ int checkAlreadyExpired(long long when) { * of a replica instance. * * Instead we add the already expired key to the database with expire time - * (possibly in the past) and wait for an explicit DEL from the primary. */ - return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host); + * (possibly in the past) and wait for an explicit DEL from the primary. + * + * If the server is a primary and in the import mode, we also add the already + * expired key and wait for an explicit DEL from the import source. 
*/ + return (when <= commandTimeSnapshot() && !server.loading && !server.primary_host && !server.import_mode); } #define EXPIRE_NX (1 << 0) diff --git a/src/functions.c b/src/functions.c index a00fefb329..b694e35252 100644 --- a/src/functions.c +++ b/src/functions.c @@ -43,9 +43,9 @@ typedef enum { static size_t engine_cache_memory = 0; /* Forward declaration */ -static void engineFunctionDispose(dict *d, void *obj); -static void engineStatsDispose(dict *d, void *obj); -static void engineLibraryDispose(dict *d, void *obj); +static void engineFunctionDispose(void *obj); +static void engineStatsDispose(void *obj); +static void engineLibraryDispose(void *obj); static int functionsVerifyName(sds name); typedef struct functionsLibEngineStats { @@ -126,15 +126,13 @@ static size_t libraryMallocSize(functionLibInfo *li) { return zmalloc_size(li) + sdsAllocSize(li->name) + sdsAllocSize(li->code); } -static void engineStatsDispose(dict *d, void *obj) { - UNUSED(d); +static void engineStatsDispose(void *obj) { functionsLibEngineStats *stats = obj; zfree(stats); } /* Dispose function memory */ -static void engineFunctionDispose(dict *d, void *obj) { - UNUSED(d); +static void engineFunctionDispose(void *obj) { if (!obj) { return; } @@ -158,15 +156,14 @@ static void engineLibraryFree(functionLibInfo *li) { zfree(li); } -static void engineLibraryDispose(dict *d, void *obj) { - UNUSED(d); +static void engineLibraryDispose(void *obj) { engineLibraryFree(obj); } /* Clear all the functions from the given library ctx */ -void functionsLibCtxClear(functionsLibCtx *lib_ctx) { - dictEmpty(lib_ctx->functions, NULL); - dictEmpty(lib_ctx->libraries, NULL); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)) { + dictEmpty(lib_ctx->functions, callback); + dictEmpty(lib_ctx->libraries, callback); dictIterator *iter = dictGetIterator(lib_ctx->engines_stats); dictEntry *entry = NULL; while ((entry = dictNext(iter))) { @@ -175,22 +172,31 @@ void 
functionsLibCtxClear(functionsLibCtx *lib_ctx) { stats->n_lib = 0; } dictReleaseIterator(iter); - curr_functions_lib_ctx->cache_memory = 0; + lib_ctx->cache_memory = 0; } -void functionsLibCtxClearCurrent(int async) { +void functionsLibCtxClearCurrent(int async, void(callback)(dict *)) { if (async) { functionsLibCtx *old_l_ctx = curr_functions_lib_ctx; curr_functions_lib_ctx = functionsLibCtxCreate(); freeFunctionsAsync(old_l_ctx); } else { - functionsLibCtxClear(curr_functions_lib_ctx); + functionsLibCtxClear(curr_functions_lib_ctx, callback); + } +} + +/* Free the given functions ctx */ +static void functionsLibCtxFreeGeneric(functionsLibCtx *functions_lib_ctx, int async) { + if (async) { + freeFunctionsAsync(functions_lib_ctx); + } else { + functionsLibCtxFree(functions_lib_ctx); } } /* Free the given functions ctx */ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { - functionsLibCtxClear(functions_lib_ctx); + functionsLibCtxClear(functions_lib_ctx, NULL); dictRelease(functions_lib_ctx->functions); dictRelease(functions_lib_ctx->libraries); dictRelease(functions_lib_ctx->engines_stats); @@ -199,8 +205,8 @@ void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx) { /* Swap the current functions ctx with the given one. * Free the old functions ctx. 
*/ -void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx) { - functionsLibCtxFree(curr_functions_lib_ctx); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async) { + functionsLibCtxFreeGeneric(curr_functions_lib_ctx, async); curr_functions_lib_ctx = new_lib_ctx; } @@ -374,7 +380,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l dictReleaseIterator(iter); iter = NULL; - functionsLibCtxClear(functions_lib_ctx_src); + functionsLibCtxClear(functions_lib_ctx_src, NULL); if (old_libraries_list) { listRelease(old_libraries_list); old_libraries_list = NULL; @@ -772,7 +778,7 @@ void functionRestoreCommand(client *c) { } if (restore_replicy == restorePolicy_Flush) { - functionsLibCtxSwapWithCurrent(functions_lib_ctx); + functionsLibCtxSwapWithCurrent(functions_lib_ctx, server.lazyfree_lazy_user_flush); functions_lib_ctx = NULL; /* avoid releasing the f_ctx in the end */ } else { if (libraryJoin(curr_functions_lib_ctx, functions_lib_ctx, restore_replicy == restorePolicy_Replace, &err) != @@ -792,7 +798,7 @@ void functionRestoreCommand(client *c) { addReply(c, shared.ok); } if (functions_lib_ctx) { - functionsLibCtxFree(functions_lib_ctx); + functionsLibCtxFreeGeneric(functions_lib_ctx, server.lazyfree_lazy_user_flush); } } @@ -814,7 +820,7 @@ void functionFlushCommand(client *c) { return; } - functionsLibCtxClearCurrent(async); + functionsLibCtxClearCurrent(async, NULL); /* Indicate that the command changed the data so it will be replicated and * counted as a data change (for persistence configuration) */ diff --git a/src/functions.h b/src/functions.h index da196cf197..b199fbd06e 100644 --- a/src/functions.h +++ b/src/functions.h @@ -133,10 +133,10 @@ dict *functionsLibGet(void); size_t functionsLibCtxFunctionsLen(functionsLibCtx *functions_ctx); functionsLibCtx *functionsLibCtxGetCurrent(void); functionsLibCtx *functionsLibCtxCreate(void); -void functionsLibCtxClearCurrent(int async); -void 
functionsLibCtxFree(functionsLibCtx *lib_ctx); -void functionsLibCtxClear(functionsLibCtx *lib_ctx); -void functionsLibCtxSwapWithCurrent(functionsLibCtx *lib_ctx); +void functionsLibCtxClearCurrent(int async, void(callback)(dict *)); +void functionsLibCtxFree(functionsLibCtx *functions_lib_ctx); +void functionsLibCtxClear(functionsLibCtx *lib_ctx, void(callback)(dict *)); +void functionsLibCtxSwapWithCurrent(functionsLibCtx *new_lib_ctx, int async); int functionLibCreateFunction(sds name, void *function, functionLibInfo *li, sds desc, uint64_t f_flags, sds *err); diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 563c5e7941..9a48c821ab 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -35,6 +35,10 @@ #include #include +#ifdef HAVE_AVX2 +#include +#endif + /* The HyperLogLog implementation is based on the following ideas: * * * The use of a 64 bit hash function as proposed in [1], in order to estimate @@ -208,6 +212,13 @@ struct hllhdr { static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected"; +#ifdef HAVE_AVX2 +static int simd_enabled = 1; +#define HLL_USE_AVX2 (simd_enabled && __builtin_cpu_supports("avx2")) +#else +#define HLL_USE_AVX2 0 +#endif + /* =========================== Low level bit macros ========================= */ /* Macros to access the dense representation. @@ -1064,6 +1075,136 @@ int hllAdd(robj *o, unsigned char *ele, size_t elesize) { } } +#ifdef HAVE_AVX2 +/* A specialized version of hllMergeDense, optimized for default configurations. 
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllMergeDense) + * + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllMergeDenseAVX2(uint8_t *reg_raw, const uint8_t *reg_dense) { + /* Shuffle indices for unpacking bytes of dense registers + * From: {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * To: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 4, 5, 6, -1, // + 7, 8, 9, -1, // + 10, 11, 12, -1, // + 13, 14, 15, -1, // + 0, 1, 2, -1, // + 3, 4, 5, -1, // + 6, 7, 8, -1, // + 9, 10, 11, -1 // + ); + + /* Merge the first 8 registers (6 bytes) normally + * as the AVX2 algorithm needs 4 padding bytes at the start */ + uint8_t val; + for (int i = 0; i < 8; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } + + /* Dense to Raw: + * + * 4 registers in 3 bytes: + * {bbaaaaaa|ccccbbbb|ddddddcc} + * + * LOAD 32 bytes (32 registers) per iteration: + * 4(padding) + 12(16 registers) + 12(16 registers) + 4(padding) + * {XXXX|AAAB|BBCC|CDDD|EEEF|FFGG|GHHH|XXXX} + * + * SHUFFLE to: + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 3 valid bytes (4 registers) and a zero byte. 
+ * + * extract registers in each group with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (<<0) + * {00000000|00bbbbbb|00000000|00000000} x8 (<<2) + * {00000000|00000000|00cccccc|00000000} x8 (<<4) + * {00000000|00000000|00000000|00dddddd} x8 (<<6) + * + * merge the extracted registers with OR: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * Finally, compute MAX(reg_raw, merged) and STORE it back to reg_raw + */ + + /* Skip 8 registers (6 bytes) */ + const uint8_t *r = reg_dense + 6 - 4; + uint8_t *t = reg_raw + 8; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x0, x; + x0 = _mm256_loadu_si256((__m256i *)r); + x = _mm256_shuffle_epi8(x0, shuffle); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00000fc0)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x0003f000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x00fc0000)); + + a2 = _mm256_slli_epi32(a2, 2); + a3 = _mm256_slli_epi32(a3, 4); + a4 = _mm256_slli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + + __m256i z = _mm256_loadu_si256((__m256i *)t); + + z = _mm256_max_epu8(z, y); + + _mm256_storeu_si256((__m256i *)t, z); + + r += 24; + t += 32; + } + + /* Merge the last 24 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 24; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} +#endif + +/* Merge dense-encoded registers to raw registers array. 
*/ +void hllMergeDense(uint8_t *reg_raw, const uint8_t *reg_dense) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllMergeDenseAVX2(reg_raw, reg_dense); + return; + } + } +#endif + + uint8_t val; + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_GET_REGISTER(val, reg_dense, i); + if (val > reg_raw[i]) { + reg_raw[i] = val; + } + } +} + /* Merge by computing MAX(registers[i],hll[i]) the HyperLogLog 'hll' * with an array of uint8_t HLL_REGISTERS registers pointed by 'max'. * @@ -1077,12 +1218,7 @@ int hllMerge(uint8_t *max, robj *hll) { int i; if (hdr->encoding == HLL_DENSE) { - uint8_t val; - - for (i = 0; i < HLL_REGISTERS; i++) { - HLL_DENSE_GET_REGISTER(val, hdr->registers, i); - if (val > max[i]) max[i] = val; - } + hllMergeDense(max, hdr->registers); } else { uint8_t *p = hll->ptr, *end = p + sdslen(hll->ptr); long runlen, regval; @@ -1114,6 +1250,121 @@ int hllMerge(uint8_t *max, robj *hll) { return C_OK; } +#ifdef HAVE_AVX2 +/* A specialized version of hllDenseCompress, optimized for default configurations. 
+ * + * Requirements: + * 1) HLL_REGISTERS == 16384 && HLL_BITS == 6 + * 2) The CPU supports AVX2 (checked at runtime in hllDenseCompress) + * + * reg_dense: pointer to the dense representation array (12288 bytes, 6 bits per register) + * reg_raw: pointer to the raw representation array (16384 bytes, one byte per register) + */ +ATTRIBUTE_TARGET_AVX2 +void hllDenseCompressAVX2(uint8_t *reg_dense, const uint8_t *reg_raw) { + /* Shuffle indices for packing bytes of dense registers + * From: {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * To: {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + */ + const __m256i shuffle = _mm256_setr_epi8( // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1, // + 0, 1, 2, // + 4, 5, 6, // + 8, 9, 10, // + 12, 13, 14, // + -1, -1, -1, -1 // + ); + + /* Raw to Dense: + * + * LOAD 32 bytes (32 registers) per iteration: + * {00aaaaaa|00bbbbbb|00cccccc|00dddddd} x8 + * + * AVX2 is little endian, each of the 8 groups is a little-endian int32. + * A group (int32) contains 4 registers. + * + * move the registers to correct positions with AND and SHIFT: + * {00aaaaaa|00000000|00000000|00000000} x8 (>>0) + * {bb000000|0000bbbb|00000000|00000000} x8 (>>2) + * {00000000|cccc0000|000000cc|00000000} x8 (>>4) + * {00000000|00000000|dddddd00|00000000} x8 (>>6) + * + * merge the registers with OR: + * {bbaaaaaa|ccccbbbb|ddddddcc|00000000} x8 + * {AAA0|BBB0|CCC0|DDD0|EEE0|FFF0|GGG0|HHH0} + * + * SHUFFLE to: + * {AAAB|BBCC|CDDD|0000|EEEF|FFGG|GHHH|0000} + * + * STORE the lower half and higher half respectively: + * AAABBBCCCDDD0000 + * EEEFFFGGGHHH0000 + * AAABBBCCCDDDEEEFFFGGGHHH0000 + * + * Note that the last 4 bytes are padding bytes. 
+ */ + + const uint8_t *r = reg_raw; + uint8_t *t = reg_dense; + + for (int i = 0; i < HLL_REGISTERS / 32 - 1; ++i) { + __m256i x = _mm256_loadu_si256((__m256i *)r); + + __m256i a1, a2, a3, a4; + a1 = _mm256_and_si256(x, _mm256_set1_epi32(0x0000003f)); + a2 = _mm256_and_si256(x, _mm256_set1_epi32(0x00003f00)); + a3 = _mm256_and_si256(x, _mm256_set1_epi32(0x003f0000)); + a4 = _mm256_and_si256(x, _mm256_set1_epi32(0x3f000000)); + + a2 = _mm256_srli_epi32(a2, 2); + a3 = _mm256_srli_epi32(a3, 4); + a4 = _mm256_srli_epi32(a4, 6); + + __m256i y1, y2, y; + y1 = _mm256_or_si256(a1, a2); + y2 = _mm256_or_si256(a3, a4); + y = _mm256_or_si256(y1, y2); + y = _mm256_shuffle_epi8(y, shuffle); + + __m128i lower, higher; + lower = _mm256_castsi256_si128(y); + higher = _mm256_extracti128_si256(y, 1); + + _mm_storeu_si128((__m128i *)t, lower); + _mm_storeu_si128((__m128i *)(t + 12), higher); + + r += 32; + t += 24; + } + + /* Merge the last 32 registers normally + * as the AVX2 algorithm needs 4 padding bytes at the end */ + for (int i = HLL_REGISTERS - 32; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} +#endif + +/* Compress raw registers to dense representation. */ +void hllDenseCompress(uint8_t *reg_dense, const uint8_t *reg_raw) { +#ifdef HAVE_AVX2 + if (HLL_REGISTERS == 16384 && HLL_BITS == 6) { + if (HLL_USE_AVX2) { + hllDenseCompressAVX2(reg_dense, reg_raw); + return; + } + } +#endif + + for (int i = 0; i < HLL_REGISTERS; i++) { + HLL_DENSE_SET_REGISTER(reg_dense, i, reg_raw[i]); + } +} + /* ========================== HyperLogLog commands ========================== */ /* Create an HLL object. We always create the HLL using sparse encoding. @@ -1363,12 +1614,17 @@ void pfmergeCommand(client *c) { /* Write the resulting HLL to the destination HLL registers and * invalidate the cached value. 
*/ - for (j = 0; j < HLL_REGISTERS; j++) { - if (max[j] == 0) continue; + if (use_dense) { hdr = o->ptr; - switch (hdr->encoding) { - case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; - case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + hllDenseCompress(hdr->registers, max); + } else { + for (j = 0; j < HLL_REGISTERS; j++) { + if (max[j] == 0) continue; + hdr = o->ptr; + switch (hdr->encoding) { + case HLL_DENSE: hllDenseSet(hdr->registers, j, max[j]); break; + case HLL_SPARSE: hllSparseSet(o, j, max[j]); break; + } } } hdr = o->ptr; /* o->ptr may be different now, as a side effect of @@ -1494,6 +1750,7 @@ void pfselftestCommand(client *c) { * PFDEBUG DECODE * PFDEBUG ENCODING * PFDEBUG TODENSE + * PFDEBUG SIMD (ON|OFF) */ void pfdebugCommand(client *c) { char *cmd = c->argv[1]->ptr; @@ -1501,6 +1758,30 @@ void pfdebugCommand(client *c) { robj *o; int j; + if (!strcasecmp(cmd, "simd")) { + if (c->argc != 3) goto arityerr; + + if (!strcasecmp(c->argv[2]->ptr, "on")) { +#ifdef HAVE_AVX2 + simd_enabled = 1; +#endif + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { +#ifdef HAVE_AVX2 + simd_enabled = 0; +#endif + } else { + addReplyError(c, "Argument must be ON or OFF"); + } + + if (HLL_USE_AVX2) { + addReplyStatus(c, "enabled"); + } else { + addReplyStatus(c, "disabled"); + } + + return; + } + o = lookupKeyWrite(c->db, c->argv[2]); if (o == NULL) { addReplyError(c, "The specified key does not exist"); diff --git a/src/io_threads.c b/src/io_threads.c index f4471b96d0..1ebd748bc2 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -441,8 +441,8 @@ void IOThreadFreeArgv(void *data) { /* This function attempts to offload the client's argv to an IO thread. * Returns C_OK if the client's argv were successfully offloaded to an IO thread, * C_ERR otherwise. 
*/ -int tryOffloadFreeArgvToIOThreads(client *c) { - if (server.active_io_threads_num <= 1 || c->argc == 0) { +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv) { + if (server.active_io_threads_num <= 1 || argc == 0) { return C_ERR; } @@ -456,11 +456,11 @@ int tryOffloadFreeArgvToIOThreads(client *c) { int last_arg_to_free = -1; /* Prepare the argv */ - for (int j = 0; j < c->argc; j++) { - if (c->argv[j]->refcount > 1) { - decrRefCount(c->argv[j]); + for (int j = 0; j < argc; j++) { + if (argv[j]->refcount > 1) { + decrRefCount(argv[j]); /* Set argv[j] to NULL to avoid double free */ - c->argv[j] = NULL; + argv[j] = NULL; } else { last_arg_to_free = j; } @@ -468,17 +468,17 @@ int tryOffloadFreeArgvToIOThreads(client *c) { /* If no argv to free, free the argv array at the main thread */ if (last_arg_to_free == -1) { - zfree(c->argv); + zfree(argv); return C_OK; } /* We set the refcount of the last arg to free to 0 to indicate that * this is the last argument to free. With this approach, we don't need to * send the argc to the IO thread and we can send just the argv ptr. */ - c->argv[last_arg_to_free]->refcount = 0; + argv[last_arg_to_free]->refcount = 0; /* Must succeed as we checked the free space before. 
*/ - IOJobQueue_push(jq, IOThreadFreeArgv, c->argv); + IOJobQueue_push(jq, IOThreadFreeArgv, argv); return C_OK; } diff --git a/src/io_threads.h b/src/io_threads.h index f9a9cf762f..8818f08588 100644 --- a/src/io_threads.h +++ b/src/io_threads.h @@ -9,7 +9,7 @@ int inMainThread(void); int trySendReadToIOThreads(client *c); int trySendWriteToIOThreads(client *c); int tryOffloadFreeObjToIOThreads(robj *o); -int tryOffloadFreeArgvToIOThreads(client *c); +int tryOffloadFreeArgvToIOThreads(client *c, int argc, robj **argv); void adjustIOThreadsByEventLoad(int numevents, int increase_only); void drainIOThreadsQueue(void); void trySendPollJobToIOThreads(void); diff --git a/src/kvstore.c b/src/kvstore.c index e92af03784..344a8af5cf 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -54,8 +54,8 @@ struct _kvstore { int flags; dictType *dtype; dict **dicts; - long long num_dicts; - long long num_dicts_bits; + int num_dicts; + int num_dicts_bits; list *rehashing; /* List of dictionaries in this kvstore that are currently rehashing. */ int resize_cursor; /* Cron job uses this cursor to gradually resize dictionaries (only used if num_dicts > 1). */ int allocated_dicts; /* The number of allocated dicts. */ @@ -423,9 +423,11 @@ unsigned long long kvstoreScan(kvstore *kvs, * `dictTryExpand` call and in case of `dictExpand` call it signifies no expansion was performed. */ int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandShouldSkipDictIndex *skip_cb) { + if (newsize == 0) return 1; for (int i = 0; i < kvs->num_dicts; i++) { - dict *d = kvstoreGetDict(kvs, i); - if (!d || (skip_cb && skip_cb(i))) continue; + if (skip_cb && skip_cb(i)) continue; + /* If the dictionary doesn't exist, create it */ + dict *d = createDictIfNeeded(kvs, i); int result = try_expand ? 
dictTryExpand(d, newsize) : dictExpand(d, newsize); if (try_expand && result == DICT_ERR) return 0; } @@ -737,7 +739,7 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata) { dict *d = kvstoreGetDict(kvs, didx); if (!d) return 0; @@ -748,14 +750,27 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, * within dict, it only reallocates the memory used by the dict structure itself using * the provided allocation function. This feature was added for the active defrag feature. * - * The 'defragfn' callback is called with a reference to the dict - * that callback can reallocate. */ -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn) { - for (int didx = 0; didx < kvs->num_dicts; didx++) { + * With 16k dictionaries for cluster mode with 1 shard, this operation may require substantial time + * to execute. A "cursor" is used to perform the operation iteratively. When first called, a + * cursor value of 0 should be provided. The return value is an updated cursor which should be + * provided on the next iteration. The operation is complete when 0 is returned. + * + * The 'defragfn' callback is called with a reference to the dict that callback can reallocate. 
*/ +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn) { + for (int didx = cursor; didx < kvs->num_dicts; didx++) { dict **d = kvstoreGetDictRef(kvs, didx), *newd; if (!*d) continue; + + listNode *rehashing_node = NULL; + if (listLength(kvs->rehashing) > 0) { + rehashing_node = ((kvstoreDictMetadata *)dictMetadata(*d))->rehashing_node; + } + if ((newd = defragfn(*d))) *d = newd; + if (rehashing_node) listNodeValue(rehashing_node) = *d; + return (didx + 1); } + return 0; } uint64_t kvstoreGetHash(kvstore *kvs, const void *key) { diff --git a/src/kvstore.h b/src/kvstore.h index 81a0d9a96e..00ec472e73 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -68,10 +68,10 @@ unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, dictScanFunction *fn, - dictDefragFunctions *defragfns, + const dictDefragFunctions *defragfns, void *privdata); typedef dict *(kvstoreDictLUTDefragFunction)(dict *d); -void kvstoreDictLUTDefrag(kvstore *kvs, kvstoreDictLUTDefragFunction *defragfn); +unsigned long kvstoreDictLUTDefrag(kvstore *kvs, unsigned long cursor, kvstoreDictLUTDefragFunction *defragfn); void *kvstoreDictFetchValue(kvstore *kvs, int didx, const void *key); dictEntry *kvstoreDictFind(kvstore *kvs, int didx, void *key); dictEntry *kvstoreDictAddRaw(kvstore *kvs, int didx, void *key, dictEntry **existing); diff --git a/src/latency.c b/src/latency.c index eef1532d03..783f04b197 100644 --- a/src/latency.c +++ b/src/latency.c @@ -37,8 +37,7 @@ #include "hdr_histogram.h" /* Dictionary type for latency events. 
*/ -int dictStringKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictStringKeyCompare(const void *key1, const void *key2) { return strcmp(key1, key2) == 0; } diff --git a/src/module.c b/src/module.c index 2884239200..5f9dff0402 100644 --- a/src/module.c +++ b/src/module.c @@ -681,6 +681,7 @@ void moduleReleaseTempClient(client *c) { c->bufpos = 0; c->raw_flag = 0; c->flag.module = 1; + c->flag.fake = 1; c->user = NULL; /* Root user */ c->cmd = c->lastcmd = c->realcmd = c->io_parsed_cmd = NULL; if (c->bstate.async_rm_call_handle) { @@ -2254,6 +2255,27 @@ int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd) { return (cp->module == module_handle); } +/* ValkeyModule_UpdateRuntimeArgs can be used to update the module argument values. + * The function parameter 'argc' indicates the number of updated arguments, and 'argv' + * represents the values of the updated arguments. + * Once 'CONFIG REWRITE' command is called, the updated argument values can be saved into conf file. + * + * The function always returns VALKEYMODULE_OK. */ +int VM_UpdateRuntimeArgs(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + struct moduleLoadQueueEntry *loadmod = ctx->module->loadmod; + for (int i = 0; i < loadmod->argc; i++) { + decrRefCount(loadmod->argv[i]); + } + zfree(loadmod->argv); + loadmod->argv = argc - 1 ? 
zmalloc(sizeof(robj *) * (argc - 1)) : NULL; + loadmod->argc = argc - 1; + for (int i = 1; i < argc; i++) { + loadmod->argv[i - 1] = argv[i]; + incrRefCount(loadmod->argv[i - 1]); + } + return VALKEYMODULE_OK; +} + /* -------------------------------------------------------------------------- * ## Module information and time measurement * -------------------------------------------------------------------------- */ @@ -11814,8 +11836,7 @@ uint64_t dictCStringKeyHash(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -int dictCStringKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictCStringKeyCompare(const void *key1, const void *key2) { return strcmp(key1, key2) == 0; } @@ -13344,7 +13365,7 @@ const char *VM_GetCurrentCommandName(ValkeyModuleCtx *ctx) { * defrag callback. */ struct ValkeyModuleDefragCtx { - long long int endtime; + monotime endtime; unsigned long *cursor; struct serverObject *key; /* Optional name of key processed, NULL when unknown. */ int dbid; /* The dbid of the key being processed, -1 when unknown. */ @@ -13373,7 +13394,7 @@ int VM_RegisterDefragFunc(ValkeyModuleCtx *ctx, ValkeyModuleDefragFunc cb) { * so it generally makes sense to do small batches of work in between calls. */ int VM_DefragShouldStop(ValkeyModuleDefragCtx *ctx) { - return (ctx->endtime != 0 && ctx->endtime < ustime()); + return (ctx->endtime != 0 && ctx->endtime <= getMonotonicUs()); } /* Store an arbitrary cursor value for future re-use. @@ -13455,7 +13476,7 @@ ValkeyModuleString *VM_DefragValkeyModuleString(ValkeyModuleDefragCtx *ctx, Valk * Returns a zero value (and initializes the cursor) if no more needs to be done, * or a non-zero value otherwise. 
*/ -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid) { +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid) { moduleValue *mv = value->ptr; moduleType *mt = mv->type; @@ -13560,6 +13581,7 @@ void moduleRegisterCoreAPI(void) { REGISTER_API(SetModuleAttribs); REGISTER_API(IsModuleNameBusy); REGISTER_API(WrongArity); + REGISTER_API(UpdateRuntimeArgs); REGISTER_API(ReplyWithLongLong); REGISTER_API(ReplyWithError); REGISTER_API(ReplyWithErrorFormat); diff --git a/src/modules/CMakeLists.txt b/src/modules/CMakeLists.txt new file mode 100644 index 0000000000..958796232f --- /dev/null +++ b/src/modules/CMakeLists.txt @@ -0,0 +1,21 @@ +# Build modules +list(APPEND MODULES_LIST "helloacl") +list(APPEND MODULES_LIST "helloblock") +list(APPEND MODULES_LIST "hellocluster") +list(APPEND MODULES_LIST "hellodict") +list(APPEND MODULES_LIST "hellohook") +list(APPEND MODULES_LIST "hellotimer") +list(APPEND MODULES_LIST "hellotype") +list(APPEND MODULES_LIST "helloworld") + +foreach (MODULE_NAME ${MODULES_LIST}) + message(STATUS "Building module: ${MODULE_NAME}") + add_library(${MODULE_NAME} SHARED "${CMAKE_CURRENT_LIST_DIR}/${MODULE_NAME}.c") + target_include_directories(${MODULE_NAME} PRIVATE "${CMAKE_SOURCE_DIR}/src") + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () +endforeach () diff --git a/src/modules/hellodict.c b/src/modules/hellodict.c index e0af06ba2f..db2fd17e8a 100644 --- a/src/modules/hellodict.c +++ b/src/modules/hellodict.c @@ -109,13 +109,13 @@ int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int arg if (ValkeyModule_Init(ctx, "hellodict", 1, VALKEYMODULE_APIVER_1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if 
(ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.set", cmd_SET, "write deny-oom", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.get", cmd_GET, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; - if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 0) == VALKEYMODULE_ERR) + if (ValkeyModule_CreateCommand(ctx, "hellodict.keyrange", cmd_KEYRANGE, "readonly", 1, 1, 1) == VALKEYMODULE_ERR) return VALKEYMODULE_ERR; /* Create our global dictionary. Here we'll set our keys and values. */ diff --git a/src/multi.c b/src/multi.c index bcffb90912..9e1f019244 100644 --- a/src/multi.c +++ b/src/multi.c @@ -238,6 +238,10 @@ void execCommand(client *c) { c->mstate.commands[j].argv = c->argv; c->mstate.commands[j].argv_len = c->argv_len; c->mstate.commands[j].cmd = c->cmd; + + /* The original argv has already been processed for slowlog and monitor, + * so we can safely free it before proceeding to the next command. */ + freeClientOriginalArgv(c); } // restore old DENY_BLOCKING value diff --git a/src/networking.c b/src/networking.c index 6751f5c7b8..debd94ddfc 100644 --- a/src/networking.c +++ b/src/networking.c @@ -314,7 +314,11 @@ int prepareClientToWrite(client *c) { * is set. */ if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR; - if (!c->conn) return C_ERR; /* Fake client for AOF loading. */ + /* Skip the fake client, such as the fake client for AOF loading. + * But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client + * but has a connection to cache the response. 
*/ + if (c->flag.fake && c->id != CLIENT_ID_CACHED_RESPONSE) return C_ERR; + serverAssert(c->conn); /* Schedule the client to write the output buffers to the socket, unless * it should already be setup to do so (it has already pending data). */ @@ -348,6 +352,9 @@ sds aggregateClientOutputBuffer(client *c) { * It needs be paired with `deleteCachedResponseClient` function to stop caching. */ client *createCachedResponseClient(int resp) { struct client *recording_client = createClient(NULL); + /* It is a fake client but with a connection, setting a special client id, + * so we can identify it's a fake cached response client. */ + recording_client->id = CLIENT_ID_CACHED_RESPONSE; recording_client->resp = resp; /* Allocating the `conn` allows to prepare the caching client before adding * data to the clients output buffer by `prepareClientToWrite`. */ @@ -888,8 +895,11 @@ void setDeferredAggregateLen(client *c, void *node, long length, char prefix) { } char lenstr[128]; - size_t lenstr_len = snprintf(lenstr, sizeof(lenstr), "%c%ld\r\n", prefix, length); - setDeferredReply(c, node, lenstr, lenstr_len); + lenstr[0] = prefix; + size_t lenstr_len = ll2string(lenstr + 1, sizeof(lenstr) - 1, length); + lenstr[lenstr_len + 1] = '\r'; + lenstr[lenstr_len + 2] = '\n'; + setDeferredReply(c, node, lenstr, lenstr_len + 3); } void setDeferredArrayLen(client *c, void *node, long length) { @@ -1478,14 +1488,19 @@ void freeClientOriginalArgv(client *c) { /* We didn't rewrite this client */ if (!c->original_argv) return; - for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); - zfree(c->original_argv); + if (tryOffloadFreeArgvToIOThreads(c, c->original_argc, c->original_argv) == C_ERR) { + for (int j = 0; j < c->original_argc; j++) decrRefCount(c->original_argv[j]); + zfree(c->original_argv); + } + c->original_argv = NULL; c->original_argc = 0; } void freeClientArgv(client *c) { - if (tryOffloadFreeArgvToIOThreads(c) == C_ERR) { + /* If original_argv exists, 
'c->argv' was allocated by the main thread, + * so it's more efficient to free it directly here rather than offloading to IO threads */ + if (c->original_argv || tryOffloadFreeArgvToIOThreads(c, c->argc, c->argv) == C_ERR) { for (int j = 0; j < c->argc; j++) decrRefCount(c->argv[j]); zfree(c->argv); } @@ -1552,12 +1567,17 @@ void unlinkClient(client *c) { * in which case it needs to be cleaned from that list */ if (c->flag.replica && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.rdb_pipe_conns) { int i; + int still_alive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { if (server.rdb_pipe_conns[i] == c->conn) { rdbPipeWriteHandlerConnRemoved(c->conn); server.rdb_pipe_conns[i] = NULL; - break; } + if (server.rdb_pipe_conns[i]) still_alive++; + } + if (still_alive == 0) { + serverLog(LL_NOTICE, "Diskless rdb transfer, last replica dropped, killing fork child."); + killRDBChild(); } } /* Only use shutdown when the fork is active and we are the parent. */ @@ -1705,10 +1725,10 @@ void freeClient(client *c) { /* Log link disconnection with replica */ if (getClientType(c) == CLIENT_TYPE_REPLICA) { - serverLog(LL_NOTICE, - c->flag.repl_rdb_channel ? "Replica %s rdb channel disconnected." 
- : "Connection with replica %s lost.", - replicationGetReplicaName(c)); + if (c->flag.repl_rdb_channel) + dualChannelServerLog(LL_NOTICE, "Replica %s rdb channel disconnected.", replicationGetReplicaName(c)); + else + serverLog(LL_NOTICE, "Connection with replica %s lost.", replicationGetReplicaName(c)); } /* Free the query buffer */ @@ -1728,6 +1748,7 @@ void freeClient(client *c) { /* UNWATCH all the keys */ unwatchAllKeys(c); listRelease(c->watched_keys); + c->watched_keys = NULL; /* Unsubscribe from all the pubsub channels */ pubsubUnsubscribeAllChannels(c, 0); @@ -1735,16 +1756,22 @@ void freeClient(client *c) { pubsubUnsubscribeAllPatterns(c, 0); unmarkClientAsPubSub(c); dictRelease(c->pubsub_channels); + c->pubsub_channels = NULL; dictRelease(c->pubsub_patterns); + c->pubsub_patterns = NULL; dictRelease(c->pubsubshard_channels); + c->pubsubshard_channels = NULL; /* Free data structures. */ listRelease(c->reply); - zfree(c->buf); + c->reply = NULL; + zfree_with_size(c->buf, c->buf_usable_size); + c->buf = NULL; freeReplicaReferencedReplBuffer(c); freeClientArgv(c); freeClientOriginalArgv(c); if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors); + c->deferred_reply_errors = NULL; #ifdef LOG_REQ_RES reqresReset(c, 1); #endif @@ -1771,6 +1798,7 @@ void freeClient(client *c) { if (server.saveparamslen == 0 && c->repl_state == REPLICA_STATE_WAIT_BGSAVE_END && server.child_type == CHILD_TYPE_RDB && server.rdb_child_type == RDB_CHILD_TYPE_DISK && anyOtherReplicaWaitRdb(c) == 0) { + serverLog(LL_NOTICE, "Background saving, persistence disabled, last replica dropped, killing fork child."); killRDBChild(); } if (c->repl_state == REPLICA_STATE_SEND_BULK) { @@ -1947,14 +1975,15 @@ int freeClientsInAsyncFreeQueue(void) { if (!c->rdb_client_disconnect_time) { if (c->conn) connSetReadHandler(c->conn, NULL); c->rdb_client_disconnect_time = server.unixtime; - serverLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", (unsigned long 
long)c->id, - replicationGetReplicaName(c), server.wait_before_rdb_client_free); + dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds", + (unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free); } if (server.unixtime - c->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue; - serverLog(LL_NOTICE, - "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " - "Freeing RDB client %llu.", - (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); + dualChannelServerLog( + LL_NOTICE, + "Replica main channel failed to establish PSYNC within the grace period (%ld seconds). " + "Freeing RDB client %llu.", + (long int)(server.unixtime - c->rdb_client_disconnect_time), (unsigned long long)c->id); c->flag.protected_rdb_channel = 0; } @@ -2521,6 +2550,7 @@ void resetClient(client *c) { serverCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL; freeClientArgv(c); + freeClientOriginalArgv(c); c->cur_script = NULL; c->reqtype = 0; c->multibulklen = 0; @@ -3369,6 +3399,29 @@ sds catClientInfoString(sds s, client *client, int hide_user_data) { return ret; } +/* Concatenate a string representing the state of a client in a human + * readable format, into the sds string 's'. + * + * This is a simplified and shortened version of catClientInfoString, + * it only added some basic fields for tracking clients. */ +sds catClientInfoShortString(sds s, client *client, int hide_user_data) { + if (!server.crashed) waitForClientIO(client); + char conninfo[CONN_INFO_LEN]; + + sds ret = sdscatfmt( + s, + FMTARGS( + "id=%U", (unsigned long long)client->id, + " addr=%s", getClientPeerId(client), + " laddr=%s", getClientSockname(client), + " %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)), + " name=%s", hide_user_data ? "*redacted*" : (client->name ? (char *)client->name->ptr : ""), + " user=%s", hide_user_data ? 
"*redacted*" : (client->user ? client->user->name : "(superuser)"), + " lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "", + " lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "")); + return ret; +} + sds getAllClientsInfoString(int type, int hide_user_data) { listNode *ln; listIter li; @@ -3569,6 +3622,10 @@ void clientCommand(client *c) { " Protect current client connection from eviction.", "NO-TOUCH (ON|OFF)", " Will not touch LRU/LFU stats when this mode is on.", + "IMPORT-SOURCE (ON|OFF)", + " Mark this connection as an import source if import-mode is enabled.", + " Sync tools can set their connections into 'import-source' state to visit", + " expired keys.", NULL}; addReplyHelp(c, help); } else if (!strcasecmp(c->argv[1]->ptr, "id") && c->argc == 2) { @@ -4042,6 +4099,22 @@ void clientCommand(client *c) { } } addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "import-source")) { + /* CLIENT IMPORT-SOURCE ON|OFF */ + if (!server.import_mode) { + addReplyError(c, "Server is not in import mode"); + return; + } + if (!strcasecmp(c->argv[2]->ptr, "on")) { + c->flag.import_source = 1; + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[2]->ptr, "off")) { + c->flag.import_source = 0; + addReply(c, shared.ok); + } else { + addReplyErrorObject(c, shared.syntaxerr); + return; + } } else { addReplySubcommandSyntaxError(c); } @@ -4181,16 +4254,53 @@ void securityWarningCommand(client *c) { freeClientAsync(c); } -/* Keep track of the original command arguments so that we can generate - * an accurate slowlog entry after the command has been executed. 
*/ -static void retainOriginalCommandVector(client *c) { - /* We already rewrote this command, so don't rewrite it again */ - if (c->original_argv) return; - c->original_argc = c->argc; - c->original_argv = zmalloc(sizeof(robj *) * (c->argc)); - for (int j = 0; j < c->argc; j++) { - c->original_argv[j] = c->argv[j]; - incrRefCount(c->argv[j]); +/* This function preserves the original command arguments for accurate slowlog recording. + * + * It performs the following operations: + * - Stores the initial command vector if not already saved + * - Manages memory allocation for command argument modifications + * + * new_argc - The new number of arguments to allocate space for if necessary. + * new_argv - Optional pointer to a new argument vector. If NULL, space will be + * allocated for new_argc arguments, preserving the existing arguments. + */ +static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) { + robj **old_argv = c->argv; + int old_argc = c->argc; + + /* Store original arguments if not already saved */ + if (!c->original_argv) { + c->original_argc = old_argc; + c->original_argv = old_argv; + } + + /* Handle direct argv replacement */ + if (new_argv) { + c->argv = new_argv; + } else if (c->original_argv == old_argv || new_argc > old_argc) { + /* Allocate new array if necessary */ + c->argv = zmalloc(sizeof(robj *) * new_argc); + + for (int i = 0; i < old_argc && i < new_argc; i++) { + c->argv[i] = old_argv[i]; + incrRefCount(c->argv[i]); + } + + /* Initialize new argument slots to NULL */ + for (int i = old_argc; i < new_argc; i++) { + c->argv[i] = NULL; + } + } + + c->argc = new_argc; + c->argv_len = new_argc; + + /* Clean up old argv if necessary */ + if (c->argv != old_argv && c->original_argv != old_argv) { + for (int i = 0; i < old_argc; i++) { + if (old_argv[i]) decrRefCount(old_argv[i]); + } + zfree(old_argv); } } @@ -4198,7 +4308,7 @@ static void retainOriginalCommandVector(client *c) { * in the slowlog. 
This information is stored in the * original_argv array. */ void redactClientCommandArgument(client *c, int argc) { - retainOriginalCommandVector(c); + backupAndUpdateClientArgv(c, c->argc, NULL); if (c->original_argv[argc] == shared.redacted) { /* This argument has already been redacted */ return; @@ -4231,10 +4341,7 @@ void rewriteClientCommandVector(client *c, int argc, ...) { /* Completely replace the client command vector with the provided one. */ void replaceClientCommandVector(client *c, int argc, robj **argv) { int j; - retainOriginalCommandVector(c); - freeClientArgv(c); - c->argv = argv; - c->argc = argc; + backupAndUpdateClientArgv(c, argc, argv); c->argv_len_sum = 0; for (j = 0; j < c->argc; j++) if (c->argv[j]) c->argv_len_sum += getStringObjectLen(c->argv[j]); @@ -4255,19 +4362,9 @@ void replaceClientCommandVector(client *c, int argc, robj **argv) { * free the no longer used objects on c->argv. */ void rewriteClientCommandArgument(client *c, int i, robj *newval) { robj *oldval; - retainOriginalCommandVector(c); + int new_argc = (i >= c->argc) ? i + 1 : c->argc; + backupAndUpdateClientArgv(c, new_argc, NULL); - /* We need to handle both extending beyond argc (just update it and - * initialize the new element) or beyond argv_len (realloc is needed). - */ - if (i >= c->argc) { - if (i >= c->argv_len) { - c->argv = zrealloc(c->argv, sizeof(robj *) * (i + 1)); - c->argv_len = i + 1; - } - c->argc = i + 1; - c->argv[i] = NULL; - } oldval = c->argv[i]; if (oldval) c->argv_len_sum -= getStringObjectLen(oldval); if (newval) c->argv_len_sum += getStringObjectLen(newval); @@ -4439,7 +4536,8 @@ int checkClientOutputBufferLimits(client *c) { * * Returns 1 if client was (flagged) closed. */ int closeClientOnOutputBufferLimitReached(client *c, int async) { - if (!c->conn) return 0; /* It is unsafe to free fake clients. */ + if (c->flag.fake) return 0; /* It is unsafe to free fake clients. 
*/ + serverAssert(c->conn); serverAssert(c->reply_bytes < SIZE_MAX - (1024 * 64)); /* Note that c->reply_bytes is irrelevant for replica clients * (they use the global repl buffers). */ @@ -4555,7 +4653,7 @@ static void pauseClientsByClient(mstime_t endTime, int isPauseClientAll) { } /* Pause actions up to the specified unixtime (in ms) for a given type of - * commands. + * purpose. * * A main use case of this function is to allow pausing replication traffic * so that a failover without data loss to occur. Replicas will continue to receive diff --git a/src/object.c b/src/object.c index 8c1cf64892..035198ad89 100644 --- a/src/object.c +++ b/src/object.c @@ -398,9 +398,14 @@ void decrRefCount(robj *o) { } } -/* See dismissObject() */ +/* See dismissObject(). sds is an exception, because the allocation + * size is known. Instead of dismissing it with madvise(MADV_DONTNEED) + * we free it via the allocator, which has minimal overhead when the + * size is known. This has advantage that it allows the allocator to + * accumulate free buffers to free whole pages, while madvise is nop + * if the buffer is less than a page. */ void dismissSds(sds s) { - dismissMemory(sdsAllocPtr(s), sdsAllocSize(s)); + sdsfree(s); } /* See dismissObject() */ diff --git a/src/quicklist.c b/src/quicklist.c index 617d21cd8c..225fac6fdf 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -210,9 +210,7 @@ void quicklistRelease(quicklist *quicklist) { * Returns 1 if listpack compressed successfully. * Returns 0 if compression failed or if listpack too small to compress. */ static int __quicklistCompressNode(quicklistNode *node) { -#ifdef SERVER_TEST node->attempted_compress = 1; -#endif if (node->dont_compress) return 0; /* validate that the node is neither @@ -250,9 +248,7 @@ static int __quicklistCompressNode(quicklistNode *node) { /* Uncompress the listpack in 'node' and update encoding details. * Returns 1 on successful decode, 0 on failure to decode. 
*/ static int __quicklistDecompressNode(quicklistNode *node) { -#ifdef SERVER_TEST node->attempted_compress = 0; -#endif node->recompress = 0; void *decompressed = zmalloc(node->sz); @@ -1692,1419 +1688,3 @@ void quicklistBookmarksClear(quicklist *ql) { /* NOTE: We do not shrink (realloc) the quick list. main use case for this * function is just before releasing the allocation. */ } - -/* The rest of this file is test cases and test helpers. */ -#ifdef SERVER_TEST -#include -#include -#include "testhelp.h" -#include - -#define yell(str, ...) printf("ERROR! " str "\n\n", __VA_ARGS__) - -#define ERROR \ - do { \ - printf("\tERROR!\n"); \ - err++; \ - } while (0) - -#define ERR(x, ...) \ - do { \ - printf("%s:%s:%d:\t", __FILE__, __func__, __LINE__); \ - printf("ERROR! " x "\n", __VA_ARGS__); \ - err++; \ - } while (0) - -#define TEST(name) printf("test — %s\n", name); -#define TEST_DESC(name, ...) printf("test — " name "\n", __VA_ARGS__); - -#define QL_TEST_VERBOSE 0 - -#define UNUSED(x) (void)(x) -static void ql_info(quicklist *ql) { -#if QL_TEST_VERBOSE - printf("Container length: %lu\n", ql->len); - printf("Container size: %lu\n", ql->count); - if (ql->head) printf("\t(zsize head: %lu)\n", lpLength(ql->head->entry)); - if (ql->tail) printf("\t(zsize tail: %lu)\n", lpLength(ql->tail->entry)); - printf("\n"); -#else - UNUSED(ql); -#endif -} - -/* Return the UNIX time in microseconds */ -static long long ustime(void) { - struct timeval tv; - long long ust; - - gettimeofday(&tv, NULL); - ust = ((long long)tv.tv_sec) * 1000000; - ust += tv.tv_usec; - return ust; -} - -/* Return the UNIX time in milliseconds */ -static long long mstime(void) { - return ustime() / 1000; -} - -/* Iterate over an entire quicklist. - * Print the list if 'print' == 1. - * - * Returns physical count of elements found by iterating over the list. */ -static int _itrprintr(quicklist *ql, int print, int forward) { - quicklistIter *iter = quicklistGetIterator(ql, forward ? 
AL_START_HEAD : AL_START_TAIL); - quicklistEntry entry; - int i = 0; - int p = 0; - quicklistNode *prev = NULL; - while (quicklistNext(iter, &entry)) { - if (entry.node != prev) { - /* Count the number of list nodes too */ - p++; - prev = entry.node; - } - if (print) { - int size = (entry.sz > (1 << 20)) ? 1 << 20 : entry.sz; - printf("[%3d (%2d)]: [%.*s] (%lld)\n", i, p, size, (char *)entry.value, entry.longval); - } - i++; - } - quicklistReleaseIterator(iter); - return i; -} -static int itrprintr(quicklist *ql, int print) { - return _itrprintr(ql, print, 1); -} - -static int itrprintr_rev(quicklist *ql, int print) { - return _itrprintr(ql, print, 0); -} - -#define ql_verify(a, b, c, d, e) \ - do { \ - err += _ql_verify((a), (b), (c), (d), (e)); \ - } while (0) - -static int _ql_verify_compress(quicklist *ql) { - int errors = 0; - if (quicklistAllowsCompression(ql)) { - quicklistNode *node = ql->head; - unsigned int low_raw = ql->compress; - unsigned int high_raw = ql->len - ql->compress; - - for (unsigned int at = 0; at < ql->len; at++, node = node->next) { - if (node && (at < low_raw || at >= high_raw)) { - if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { - yell("Incorrect compression: node %d is " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; recompress: %d)", - at, ql->compress, low_raw, high_raw, ql->len, node->sz, node->recompress); - errors++; - } - } else { - if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && !node->attempted_compress) { - yell("Incorrect non-compression: node %d is NOT " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; recompress: %d; attempted: %d)", - at, ql->compress, low_raw, high_raw, ql->len, node->sz, node->recompress, - node->attempted_compress); - errors++; - } - } - } - } - return errors; -} - -/* Verify list metadata matches physical list contents. 
*/ -static int _ql_verify(quicklist *ql, uint32_t len, uint32_t count, uint32_t head_count, uint32_t tail_count) { - int errors = 0; - - ql_info(ql); - if (len != ql->len) { - yell("quicklist length wrong: expected %d, got %lu", len, ql->len); - errors++; - } - - if (count != ql->count) { - yell("quicklist count wrong: expected %d, got %lu", count, ql->count); - errors++; - } - - int loopr = itrprintr(ql, 0); - if (loopr != (int)ql->count) { - yell("quicklist cached count not match actual count: expected %lu, got " - "%d", - ql->count, loopr); - errors++; - } - - int rloopr = itrprintr_rev(ql, 0); - if (loopr != rloopr) { - yell("quicklist has different forward count than reverse count! " - "Forward count is %d, reverse count is %d.", - loopr, rloopr); - errors++; - } - - if (ql->len == 0 && !errors) { - return errors; - } - - if (ql->head && head_count != ql->head->count && head_count != lpLength(ql->head->entry)) { - yell("quicklist head count wrong: expected %d, " - "got cached %d vs. actual %lu", - head_count, ql->head->count, lpLength(ql->head->entry)); - errors++; - } - - if (ql->tail && tail_count != ql->tail->count && tail_count != lpLength(ql->tail->entry)) { - yell("quicklist tail count wrong: expected %d, " - "got cached %u vs. actual %lu", - tail_count, ql->tail->count, lpLength(ql->tail->entry)); - errors++; - } - - errors += _ql_verify_compress(ql); - return errors; -} - -/* Release iterator and verify compress correctly. 
*/ -static void ql_release_iterator(quicklistIter *iter) { - quicklist *ql = NULL; - if (iter) ql = iter->quicklist; - quicklistReleaseIterator(iter); - if (ql) assert(!_ql_verify_compress(ql)); -} - -/* Generate new string concatenating integer i against string 'prefix' */ -static char *genstr(char *prefix, int i) { - static char result[64] = {0}; - snprintf(result, sizeof(result), "%s%d", prefix, i); - return result; -} - -static void randstring(unsigned char *target, size_t sz) { - size_t p = 0; - int minval, maxval; - switch (rand() % 3) { - case 0: - minval = 'a'; - maxval = 'z'; - break; - case 1: - minval = '0'; - maxval = '9'; - break; - case 2: - minval = 'A'; - maxval = 'Z'; - break; - default: assert(NULL); - } - - while (p < sz) target[p++] = minval + rand() % (maxval - minval + 1); -} - -/* main test, but callable from other files */ -int quicklistTest(int argc, char *argv[], int flags) { - UNUSED(argc); - UNUSED(argv); - - int accurate = (flags & TEST_ACCURATE); - unsigned int err = 0; - int optimize_start = -(int)(sizeof(optimization_level) / sizeof(*optimization_level)); - - printf("Starting optimization offset at: %d\n", optimize_start); - - int options[] = {0, 1, 2, 3, 4, 5, 6, 10}; - int fills[] = {-5, -4, -3, -2, -1, 0, 1, 2, 32, 66, 128, 999}; - size_t option_count = sizeof(options) / sizeof(*options); - int fill_count = (int)(sizeof(fills) / sizeof(*fills)); - long long runtime[option_count]; - - for (int _i = 0; _i < (int)option_count; _i++) { - printf("Testing Compression option %d\n", options[_i]); - long long start = mstime(); - quicklistIter *iter; - - TEST("create list") { - quicklist *ql = quicklistNew(-2, options[_i]); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("add to tail of empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "hello", 6); - /* 1 for head and 1 for tail because 1 node = head = tail */ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - - TEST("add to 
head of empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - /* 1 for head and 1 for tail because 1 node = head = tail */ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - - TEST_DESC("add to tail 5x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 5; i++) quicklistPushTail(ql, genstr("hello", i), 32); - if (ql->count != 5) ERROR; - if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); - quicklistRelease(ql); - } - } - - TEST_DESC("add to head 5x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 5; i++) quicklistPushHead(ql, genstr("hello", i), 32); - if (ql->count != 5) ERROR; - if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); - quicklistRelease(ql); - } - } - - TEST_DESC("add to tail 500x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 64); - if (ql->count != 500) ERROR; - if (fills[f] == 32) ql_verify(ql, 16, 500, 32, 20); - quicklistRelease(ql); - } - } - - TEST_DESC("add to head 500x at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - if (ql->count != 500) ERROR; - if (fills[f] == 32) ql_verify(ql, 16, 500, 20, 32); - quicklistRelease(ql); - } - } - - TEST("rotate empty") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistRotate(ql); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("Comprassion Plain node") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? 
quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - - char buf[large_limit]; - quicklist *ql = quicklistNew(fills[f], 1); - for (int i = 0; i < 500; i++) { - /* Set to 256 to allow the node to be triggered to compress, - * if it is less than 48(nocompress), the test will be successful. */ - snprintf(buf, sizeof(buf), "hello%d", i); - quicklistPushHead(ql, buf, large_limit); - } - - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - assert(QL_NODE_IS_PLAIN(entry.node)); - snprintf(buf, sizeof(buf), "hello%d", i); - if (strcmp((char *)entry.value, buf)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, buf, i); - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("NEXT plain node") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - quicklist *ql = quicklistNew(fills[f], options[_i]); - - char buf[large_limit]; - memcpy(buf, "plain", 5); - quicklistPushHead(ql, buf, large_limit); - quicklistPushHead(ql, buf, large_limit); - quicklistPushHead(ql, "packed3", 7); - quicklistPushHead(ql, "packed4", 7); - quicklistPushHead(ql, buf, large_limit); - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - - while (quicklistNext(iter, &entry) != 0) { - if (QL_NODE_IS_PLAIN(entry.node)) - assert(!memcmp(entry.value, "plain", 5)); - else - assert(!memcmp(entry.value, "packed", 6)); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("rotate plain node ") { - for (int f = 0; f < fill_count; f++) { - size_t large_limit = (fills[f] < 0) ? 
quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; - - unsigned char *data = NULL; - size_t sz; - long long lv; - int i = 0; - quicklist *ql = quicklistNew(fills[f], options[_i]); - char buf[large_limit]; - memcpy(buf, "hello1", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello4", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello3", 6); - quicklistPushHead(ql, buf, large_limit); - memcpy(buf, "hello2", 6); - quicklistPushHead(ql, buf, large_limit); - quicklistRotate(ql); - - for (i = 1; i < 5; i++) { - assert(QL_NODE_IS_PLAIN(ql->tail)); - quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - int temp_char = data[5]; - zfree(data); - assert(temp_char == ('0' + i)); - } - - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - } - - TEST("rotate one val once") { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistRotate(ql); - /* Ignore compression verify because listpack is - * too small to compress. 
*/ - ql_verify(ql, 1, 1, 1, 1); - quicklistRelease(ql); - } - } - - TEST_DESC("rotate 500 val 5000 times at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushHead(ql, "900", 3); - quicklistPushHead(ql, "7000", 4); - quicklistPushHead(ql, "-1200", 5); - quicklistPushHead(ql, "42", 2); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 64); - ql_info(ql); - for (int i = 0; i < 5000; i++) { - ql_info(ql); - quicklistRotate(ql); - } - if (fills[f] == 1) - ql_verify(ql, 504, 504, 1, 1); - else if (fills[f] == 2) - ql_verify(ql, 252, 504, 2, 2); - else if (fills[f] == 32) - ql_verify(ql, 16, 504, 32, 24); - quicklistRelease(ql); - } - } - - TEST("pop empty") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPop(ql, QUICKLIST_HEAD, NULL, NULL, NULL); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop 1 string from 1") { - quicklist *ql = quicklistNew(-2, options[_i]); - char *populate = genstr("hello", 331); - quicklistPushHead(ql, populate, 32); - unsigned char *data; - size_t sz; - long long lv; - ql_info(ql); - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); - assert(data != NULL); - assert(sz == 32); - if (strcmp(populate, (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value (%s)", size, data, populate); - } - zfree(data); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 1 number from 1") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "55513", 5); - unsigned char *data; - size_t sz; - long long lv; - ql_info(ql); - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); - assert(data == NULL); - assert(lv == 55513); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 500 from 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", 
i), 32); - ql_info(ql); - for (int i = 0; i < 500; i++) { - unsigned char *data; - size_t sz; - long long lv; - int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - assert(ret == 1); - assert(data != NULL); - assert(sz == 32); - if (strcmp(genstr("hello", 499 - i), (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value (%s)", size, data, genstr("hello", 499 - i)); - } - zfree(data); - } - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("pop head 5000 from 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - for (int i = 0; i < 5000; i++) { - unsigned char *data; - size_t sz; - long long lv; - int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); - if (i < 500) { - assert(ret == 1); - assert(data != NULL); - assert(sz == 32); - if (strcmp(genstr("hello", 499 - i), (char *)data)) { - int size = sz; - ERR("Pop'd value (%.*s) didn't equal original value " - "(%s)", - size, data, genstr("hello", 499 - i)); - } - zfree(data); - } else { - assert(ret == 0); - } - } - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("iterate forward over 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - int i = 499, count = 0; - while (quicklistNext(iter, &entry)) { - char *h = genstr("hello", i); - if (strcmp((char *)entry.value, h)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, h, i); - i--; - count++; - } - if (count != 500) ERR("Didn't iterate over exactly 500 elements (%d)", i); - ql_verify(ql, 16, 500, 20, 32); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("iterate reverse over 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for 
(int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - char *h = genstr("hello", i); - if (strcmp((char *)entry.value, h)) - ERR("value [%s] didn't match [%s] at position %d", entry.value, h, i); - i++; - } - if (i != 500) ERR("Didn't iterate over exactly 500 elements (%d)", i); - ql_verify(ql, 16, 500, 20, 32); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert after 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - quicklistInsertAfter(iter, &entry, "abc", 4); - ql_release_iterator(iter); - ql_verify(ql, 1, 2, 2, 2); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "hello", 5)) { - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - sz = entry.sz; - if (strncmp((char *)entry.value, "abc", 3)) { - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert before 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, "hello", 6); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - quicklistInsertBefore(iter, &entry, "abc", 4); - ql_release_iterator(iter); - ql_verify(ql, 1, 2, 2, 2); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "abc", 3)) { - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - sz = entry.sz; - if 
(strncmp((char *)entry.value, "hello", 5)) { - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("insert head while head node is full") { - quicklist *ql = quicklistNew(4, options[_i]); - for (int i = 0; i < 10; i++) quicklistPushTail(ql, genstr("hello", i), 6); - quicklistSetFill(ql, -1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -10, &entry); - char buf[4096] = {0}; - quicklistInsertBefore(iter, &entry, buf, 4096); - ql_release_iterator(iter); - ql_verify(ql, 4, 11, 1, 2); - quicklistRelease(ql); - } - - TEST("insert tail while tail node is full") { - quicklist *ql = quicklistNew(4, options[_i]); - for (int i = 0; i < 10; i++) quicklistPushHead(ql, genstr("hello", i), 6); - quicklistSetFill(ql, -1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - char buf[4096] = {0}; - quicklistInsertAfter(iter, &entry, buf, 4096); - ql_release_iterator(iter); - ql_verify(ql, 4, 11, 2, 1); - quicklistRelease(ql); - } - - TEST_DESC("insert once in elements while iterating at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - quicklistPushTail(ql, "abc", 3); - quicklistSetFill(ql, 1); - quicklistPushTail(ql, "def", 3); /* force to unique node */ - quicklistSetFill(ql, f); - quicklistPushTail(ql, "bob", 3); /* force to reset for +3 */ - quicklistPushTail(ql, "foo", 3); - quicklistPushTail(ql, "zoo", 3); - - itrprintr(ql, 0); - /* insert "bar" before "bob" while iterating over list. */ - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - while (quicklistNext(iter, &entry)) { - if (!strncmp((char *)entry.value, "bob", 3)) { - /* Insert as fill = 1 so it spills into new node. */ - quicklistInsertBefore(iter, &entry, "bar", 3); - break; /* didn't we fix insert-while-iterating? 
*/ - } - } - ql_release_iterator(iter); - itrprintr(ql, 0); - - /* verify results */ - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - int sz = entry.sz; - - if (strncmp((char *)entry.value, "abc", 3)) - ERR("Value 0 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (strncmp((char *)entry.value, "def", 3)) - ERR("Value 1 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); - if (strncmp((char *)entry.value, "bar", 3)) - ERR("Value 2 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); - if (strncmp((char *)entry.value, "bob", 3)) - ERR("Value 3 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); - if (strncmp((char *)entry.value, "foo", 3)) - ERR("Value 4 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 5, &entry); - if (strncmp((char *)entry.value, "zoo", 3)) - ERR("Value 5 didn't match, instead got: %.*s", sz, entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("insert [before] 250 new in middle of 500 elements at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 32); - for (int i = 0; i < 250; i++) { - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); - quicklistInsertBefore(iter, &entry, genstr("abc", i), 32); - ql_release_iterator(iter); - } - if (fills[f] == 32) ql_verify(ql, 25, 750, 32, 20); - quicklistRelease(ql); - } - } - - TEST_DESC("insert [after] 250 new in middle of 500 elements at compress %d", options[_i]) 
{ - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - for (int i = 0; i < 250; i++) { - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); - quicklistInsertAfter(iter, &entry, genstr("abc", i), 32); - ql_release_iterator(iter); - } - - if (ql->count != 750) ERR("List size not 750, but rather %ld", ql->count); - - if (fills[f] == 32) ql_verify(ql, 26, 750, 20, 32); - quicklistRelease(ql); - } - } - - TEST("duplicate empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - ql_verify(ql, 0, 0, 0, 0); - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 0, 0, 0, 0); - quicklistRelease(ql); - quicklistRelease(copy); - } - - TEST("duplicate list of 1 element") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushHead(ql, genstr("hello", 3), 32); - ql_verify(ql, 1, 1, 1, 1); - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 1, 1, 1, 1); - quicklistRelease(ql); - quicklistRelease(copy); - } - - TEST("duplicate list of 500") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); - ql_verify(ql, 16, 500, 20, 32); - - quicklist *copy = quicklistDup(ql); - ql_verify(copy, 16, 500, 20, 32); - quicklistRelease(ql); - quicklistRelease(copy); - } - - for (int f = 0; f < fill_count; f++) { - TEST_DESC("index 1,200 from 500 list at fill %d at compress %d", f, options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (strcmp((char *)entry.value, "hello2") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 200, &entry); - if (strcmp((char *)entry.value, "hello201") 
!= 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index -1,-2 from 500 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (strcmp((char *)entry.value, "hello500") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); - if (strcmp((char *)entry.value, "hello499") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index -100 from 500 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, -100, &entry); - if (strcmp((char *)entry.value, "hello401") != 0) ERR("Value: %s", entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST_DESC("index too big +1 from 50 list at fill %d at compress %d", fills[f], options[_i]) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - quicklistEntry entry; - int sz = entry.sz; - iter = quicklistGetIteratorEntryAtIdx(ql, 50, &entry); - if (iter) ERR("Index found at 50 with 50 list: %.*s", sz, entry.value); - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST("delete range empty list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistDelRange(ql, 5, 20); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete range of entire node in list of one node") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", 
i), 32); - ql_verify(ql, 1, 32, 32, 32); - quicklistDelRange(ql, 0, 32); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete range of entire node with overflow counts") { - quicklist *ql = quicklistNew(-2, options[_i]); - for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); - ql_verify(ql, 1, 32, 32, 32); - quicklistDelRange(ql, 0, 128); - ql_verify(ql, 0, 0, 0, 0); - quicklistRelease(ql); - } - - TEST("delete middle 100 of 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, 200, 100); - ql_verify(ql, 14, 400, 32, 20); - quicklistRelease(ql); - } - - TEST("delete less than fill but across nodes") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, 60, 10); - ql_verify(ql, 16, 490, 32, 20); - quicklistRelease(ql); - } - - TEST("delete negative 1 from 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, -1, 1); - ql_verify(ql, 16, 499, 32, 19); - quicklistRelease(ql); - } - - TEST("delete negative 1 from 500 list with overflow counts") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 16, 500, 32, 20); - quicklistDelRange(ql, -1, 128); - ql_verify(ql, 16, 499, 32, 19); - quicklistRelease(ql); - } - - TEST("delete negative 100 from 500 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 500; i++) quicklistPushTail(ql, 
genstr("hello", i + 1), 32); - quicklistDelRange(ql, -100, 100); - ql_verify(ql, 13, 400, 32, 16); - quicklistRelease(ql); - } - - TEST("delete -10 count 5 from 50 list") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); - ql_verify(ql, 2, 50, 32, 18); - quicklistDelRange(ql, -10, 5); - ql_verify(ql, 2, 45, 32, 13); - quicklistRelease(ql); - } - - TEST("numbers only list read") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "1111", 4); - quicklistPushTail(ql, "2222", 4); - quicklistPushTail(ql, "3333", 4); - quicklistPushTail(ql, "4444", 4); - ql_verify(ql, 1, 4, 4, 4); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != 1111) ERR("Not 1111, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); - if (entry.longval != 2222) ERR("Not 2222, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); - if (entry.longval != 3333) ERR("Not 3333, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); - if (entry.longval != 4444) ERR("Not 4444, %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); - if (iter) ERR("Index past elements: %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (entry.longval != 4444) ERR("Not 4444 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); - if (entry.longval != 3333) ERR("Not 3333 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -3, &entry); - if (entry.longval != 2222) ERR("Not 2222 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - 
- iter = quicklistGetIteratorEntryAtIdx(ql, -4, &entry); - if (entry.longval != 1111) ERR("Not 1111 (reverse), %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -5, &entry); - if (iter) ERR("Index past elements (reverse), %lld", entry.longval); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("numbers larger list read") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistSetFill(ql, 32); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 5000; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); - quicklistEntry entry; - for (int i = 0; i < 5000; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[i]) ERR("[%d] Not longval %lld but rather %lld", i, nums[i], entry.longval); - entry.longval = 0xdeadbeef; - ql_release_iterator(iter); - } - iter = quicklistGetIteratorEntryAtIdx(ql, 5000, &entry); - if (strncmp((char *)entry.value, "xxxxxxxxxxxxxxxxxxxx", 20)) ERR("String val not match: %s", entry.value); - ql_verify(ql, 157, 5001, 32, 9); - ql_release_iterator(iter); - quicklistRelease(ql); - } - - TEST("numbers larger list read B") { - quicklist *ql = quicklistNew(-2, options[_i]); - quicklistPushTail(ql, "99", 2); - quicklistPushTail(ql, "98", 2); - quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); - quicklistPushTail(ql, "96", 2); - quicklistPushTail(ql, "95", 2); - quicklistReplaceAtIndex(ql, 1, "foo", 3); - quicklistReplaceAtIndex(ql, -1, "bar", 3); - quicklistRelease(ql); - } - - TEST_DESC("lrem test at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char *words[] = {"abc", "foo", "bar", "foobar", "foobared", "zap", "bar", "test", "foo"}; - char *result[] = {"abc", "foo", "foobar", "foobared", "zap", "test", "foo"}; - char 
*resultB[] = {"abc", "foo", "foobar", "foobared", "zap", "test"}; - for (int i = 0; i < 9; i++) quicklistPushTail(ql, words[i], strlen(words[i])); - - /* lrem 0 bar */ - quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); - quicklistEntry entry; - int i = 0; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"bar", 3)) { - quicklistDelEntry(iter, &entry); - } - i++; - } - ql_release_iterator(iter); - - /* check result of lrem 0 bar */ - iter = quicklistGetIterator(ql, AL_START_HEAD); - i = 0; - while (quicklistNext(iter, &entry)) { - /* Result must be: abc, foo, foobar, foobared, zap, test, - * foo */ - int sz = entry.sz; - if (strncmp((char *)entry.value, result[i], entry.sz)) { - ERR("No match at position %d, got %.*s instead of %s", i, sz, entry.value, result[i]); - } - i++; - } - ql_release_iterator(iter); - - quicklistPushTail(ql, "foo", 3); - - /* lrem -2 foo */ - iter = quicklistGetIterator(ql, AL_START_TAIL); - i = 0; - int del = 2; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"foo", 3)) { - quicklistDelEntry(iter, &entry); - del--; - } - if (!del) break; - i++; - } - ql_release_iterator(iter); - - /* check result of lrem -2 foo */ - /* (we're ignoring the '2' part and still deleting all foo - * because - * we only have two foo) */ - iter = quicklistGetIterator(ql, AL_START_TAIL); - i = 0; - size_t resB = sizeof(resultB) / sizeof(*resultB); - while (quicklistNext(iter, &entry)) { - /* Result must be: abc, foo, foobar, foobared, zap, test, - * foo */ - int sz = entry.sz; - if (strncmp((char *)entry.value, resultB[resB - 1 - i], sz)) { - ERR("No match at position %d, got %.*s instead of %s", i, sz, entry.value, - resultB[resB - 1 - i]); - } - i++; - } - - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("iterate reverse + delete at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], 
options[_i]); - quicklistPushTail(ql, "abc", 3); - quicklistPushTail(ql, "def", 3); - quicklistPushTail(ql, "hij", 3); - quicklistPushTail(ql, "jkl", 3); - quicklistPushTail(ql, "oop", 3); - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); - int i = 0; - while (quicklistNext(iter, &entry)) { - if (quicklistCompare(&entry, (unsigned char *)"hij", 3)) { - quicklistDelEntry(iter, &entry); - } - i++; - } - ql_release_iterator(iter); - - if (i != 5) ERR("Didn't iterate 5 times, iterated %d times.", i); - - /* Check results after deletion of "hij" */ - iter = quicklistGetIterator(ql, AL_START_HEAD); - i = 0; - char *vals[] = {"abc", "def", "jkl", "oop"}; - while (quicklistNext(iter, &entry)) { - if (!quicklistCompare(&entry, (unsigned char *)vals[i], 3)) { - ERR("Value at %d didn't match %s\n", i, vals[i]); - } - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("iterator at index test at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 760; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - - quicklistEntry entry; - quicklistIter *iter = quicklistGetIteratorAtIdx(ql, AL_START_HEAD, 437); - int i = 437; - while (quicklistNext(iter, &entry)) { - if (entry.longval != nums[i]) ERR("Expected %lld, but got %lld", entry.longval, nums[i]); - i++; - } - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test A at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 32; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) 
ql_verify(ql, 1, 32, 32, 32); - /* ltrim 25 53 (keep [25,32] inclusive = 7 remaining) */ - quicklistDelRange(ql, 0, 25); - quicklistDelRange(ql, 0, 0); - quicklistEntry entry; - for (int i = 0; i < 7; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[25 + i]) - ERR("Deleted invalid range! Expected %lld but got " - "%lld", - entry.longval, nums[25 + i]); - ql_release_iterator(iter); - } - if (fills[f] == 32) ql_verify(ql, 1, 7, 7, 7); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test B at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - /* Force-disable compression because our 33 sequential - * integers don't compress and the check always fails. */ - quicklist *ql = quicklistNew(fills[f], QUICKLIST_NOCOMPRESS); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */ - quicklistDelRange(ql, 0, 5); - quicklistDelRange(ql, -16, 16); - if (fills[f] == 32) ql_verify(ql, 1, 12, 12, 12); - quicklistEntry entry; - - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != 5) ERR("A: longval not 5, but %lld", entry.longval); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - if (entry.longval != 16) ERR("B! got instead: %lld", entry.longval); - quicklistPushTail(ql, "bobobob", 7); - ql_release_iterator(iter); - - iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); - int sz = entry.sz; - if (strncmp((char *)entry.value, "bobobob", 7)) - ERR("Tail doesn't match bobobob, it's %.*s instead", sz, entry.value); - ql_release_iterator(iter); - - for (int i = 0; i < 12; i++) { - iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); - if (entry.longval != nums[5 + i]) - ERR("Deleted invalid range! 
Expected %lld but got " - "%lld", - entry.longval, nums[5 + i]); - ql_release_iterator(iter); - } - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test C at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */ - quicklistDelRange(ql, 0, 3); - quicklistDelRange(ql, -29, 4000); /* make sure not loop forever */ - if (fills[f] == 32) ql_verify(ql, 1, 1, 1, 1); - quicklistEntry entry; - iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); - if (entry.longval != -5157318210846258173) ERROR; - ql_release_iterator(iter); - quicklistRelease(ql); - } - } - - TEST_DESC("ltrim test D at compress %d", options[_i]) { - for (int f = 0; f < fill_count; f++) { - quicklist *ql = quicklistNew(fills[f], options[_i]); - char num[32]; - long long nums[5000]; - for (int i = 0; i < 33; i++) { - nums[i] = -5157318210846258176 + i; - int sz = ll2string(num, sizeof(num), nums[i]); - quicklistPushTail(ql, num, sz); - } - if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); - quicklistDelRange(ql, -12, 3); - if (ql->count != 30) ERR("Didn't delete exactly three elements! Count is: %lu", ql->count); - quicklistRelease(ql); - } - } - - long long stop = mstime(); - runtime[_i] = stop - start; - } - - /* Run a longer test of compression depth outside of primary test loop. */ - int list_sizes[] = {250, 251, 500, 999, 1000}; - long long start = mstime(); - int list_count = accurate ? 
(int)(sizeof(list_sizes) / sizeof(*list_sizes)) : 1; - for (int list = 0; list < list_count; list++) { - TEST_DESC("verify specific compression of interior nodes with %d list ", list_sizes[list]) { - for (int f = 0; f < fill_count; f++) { - for (int depth = 1; depth < 40; depth++) { - /* skip over many redundant test cases */ - quicklist *ql = quicklistNew(fills[f], depth); - for (int i = 0; i < list_sizes[list]; i++) { - quicklistPushTail(ql, genstr("hello TAIL", i + 1), 64); - quicklistPushHead(ql, genstr("hello HEAD", i + 1), 64); - } - - for (int step = 0; step < 2; step++) { - /* test remove node */ - if (step == 1) { - for (int i = 0; i < list_sizes[list] / 2; i++) { - unsigned char *data; - assert(quicklistPop(ql, QUICKLIST_HEAD, &data, NULL, NULL)); - zfree(data); - assert(quicklistPop(ql, QUICKLIST_TAIL, &data, NULL, NULL)); - zfree(data); - } - } - quicklistNode *node = ql->head; - unsigned int low_raw = ql->compress; - unsigned int high_raw = ql->len - ql->compress; - - for (unsigned int at = 0; at < ql->len; at++, node = node->next) { - if (at < low_raw || at >= high_raw) { - if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { - ERR("Incorrect compression: node %d is " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu)", - at, depth, low_raw, high_raw, ql->len, node->sz); - } - } else { - if (node->encoding != QUICKLIST_NODE_ENCODING_LZF) { - ERR("Incorrect non-compression: node %d is NOT " - "compressed at depth %d ((%u, %u); total " - "nodes: %lu; size: %zu; attempted: %d)", - at, depth, low_raw, high_raw, ql->len, node->sz, node->attempted_compress); - } - } - } - } - - quicklistRelease(ql); - } - } - } - } - long long stop = mstime(); - - printf("\n"); - for (size_t i = 0; i < option_count; i++) - printf("Test Loop %02d: %0.2f seconds.\n", options[i], (float)runtime[i] / 1000); - printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000); - printf("\n"); - - TEST("bookmark get updated to next item") { - quicklist *ql 
= quicklistNew(1, 0); - quicklistPushTail(ql, "1", 1); - quicklistPushTail(ql, "2", 1); - quicklistPushTail(ql, "3", 1); - quicklistPushTail(ql, "4", 1); - quicklistPushTail(ql, "5", 1); - assert(ql->len == 5); - /* add two bookmarks, one pointing to the node before the last. */ - assert(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next)); - assert(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev)); - /* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */ - assert(quicklistBookmarkFind(ql, "_test") == ql->tail->prev); - assert(quicklistDelRange(ql, -2, 1)); - assert(quicklistBookmarkFind(ql, "_test") == ql->tail); - /* delete the last node, and see that the bookmark was deleted. */ - assert(quicklistDelRange(ql, -1, 1)); - assert(quicklistBookmarkFind(ql, "_test") == NULL); - /* test that other bookmarks aren't affected */ - assert(quicklistBookmarkFind(ql, "_dummy") == ql->head->next); - assert(quicklistBookmarkFind(ql, "_missing") == NULL); - assert(ql->len == 3); - quicklistBookmarksClear(ql); /* for coverage */ - assert(quicklistBookmarkFind(ql, "_dummy") == NULL); - quicklistRelease(ql); - } - - TEST("bookmark limit") { - int i; - quicklist *ql = quicklistNew(1, 0); - quicklistPushHead(ql, "1", 1); - for (i = 0; i < QL_MAX_BM; i++) assert(quicklistBookmarkCreate(&ql, genstr("", i), ql->head)); - /* when all bookmarks are used, creation fails */ - assert(!quicklistBookmarkCreate(&ql, "_test", ql->head)); - /* delete one and see that we can now create another */ - assert(quicklistBookmarkDelete(ql, "0")); - assert(quicklistBookmarkCreate(&ql, "_test", ql->head)); - /* delete one and see that the rest survive */ - assert(quicklistBookmarkDelete(ql, "_test")); - for (i = 1; i < QL_MAX_BM; i++) assert(quicklistBookmarkFind(ql, genstr("", i)) == ql->head); - /* make sure the deleted ones are indeed gone */ - assert(!quicklistBookmarkFind(ql, "0")); - assert(!quicklistBookmarkFind(ql, "_test")); - 
quicklistRelease(ql); - } - - if (flags & TEST_LARGE_MEMORY) { - TEST("compress and decompress quicklist listpack node") { - quicklistNode *node = quicklistCreateNode(); - node->entry = lpNew(0); - - /* Just to avoid triggering the assertion in __quicklistCompressNode(), - * it disables the passing of quicklist head or tail node. */ - node->prev = quicklistCreateNode(); - node->next = quicklistCreateNode(); - - /* Create a rand string */ - size_t sz = (1 << 25); /* 32MB per one entry */ - unsigned char *s = zmalloc(sz); - randstring(s, sz); - - /* Keep filling the node, until it reaches 1GB */ - for (int i = 0; i < 32; i++) { - node->entry = lpAppend(node->entry, s, sz); - quicklistNodeUpdateSz(node); - - long long start = mstime(); - assert(__quicklistCompressNode(node)); - assert(__quicklistDecompressNode(node)); - printf("Compress and decompress: %zu MB in %.2f seconds.\n", node->sz / 1024 / 1024, - (float)(mstime() - start) / 1000); - } - - zfree(s); - zfree(node->prev); - zfree(node->next); - zfree(node->entry); - zfree(node); - } - -#if ULONG_MAX >= 0xffffffffffffffff - TEST("compress and decomress quicklist plain node large than UINT32_MAX") { - size_t sz = (1ull << 32); - unsigned char *s = zmalloc(sz); - randstring(s, sz); - memcpy(s, "helloworld", 10); - memcpy(s + sz - 10, "1234567890", 10); - - quicklistNode *node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, s, sz); - - /* Just to avoid triggering the assertion in __quicklistCompressNode(), - * it disables the passing of quicklist head or tail node. 
*/ - node->prev = quicklistCreateNode(); - node->next = quicklistCreateNode(); - - long long start = mstime(); - assert(__quicklistCompressNode(node)); - assert(__quicklistDecompressNode(node)); - printf("Compress and decompress: %zu MB in %.2f seconds.\n", node->sz / 1024 / 1024, - (float)(mstime() - start) / 1000); - - assert(memcmp(node->entry, "helloworld", 10) == 0); - assert(memcmp(node->entry + sz - 10, "1234567890", 10) == 0); - zfree(node->prev); - zfree(node->next); - zfree(node->entry); - zfree(node); - } -#endif - } - - if (!err) - printf("ALL TESTS PASSED!\n"); - else - ERR("Sorry, not all tests passed! In fact, %d tests failed.", err); - - return err; -} -#endif diff --git a/src/quicklist.h b/src/quicklist.h index bb94807913..4411f823b0 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -198,10 +198,6 @@ quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name); void quicklistBookmarksClear(quicklist *ql); int quicklistSetPackedThreshold(size_t sz); -#ifdef SERVER_TEST -int quicklistTest(int argc, char *argv[], int flags); -#endif - /* Directions for iterators */ #define AL_START_HEAD 0 #define AL_START_TAIL 1 diff --git a/src/rdb.c b/src/rdb.c index d5f3d8482b..d4c28e2f2e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -49,6 +49,9 @@ #include #include +/* Size of the static buffer used for rdbcompression */ +#define LZF_STATIC_BUFFER_SIZE (8 * 1024) + /* This macro is called when the internal RDB structure is corrupt */ #define rdbReportCorruptRDB(...) 
rdbReportError(1, __LINE__, __VA_ARGS__) /* This macro is called when RDB read failed (possibly a short read) */ @@ -389,18 +392,20 @@ ssize_t rdbSaveLzfBlob(rio *rdb, void *data, size_t compress_len, size_t origina ssize_t rdbSaveLzfStringObject(rio *rdb, unsigned char *s, size_t len) { size_t comprlen, outlen; void *out; + static void *buffer = NULL; /* We require at least four bytes compression for this to be worth it */ if (len <= 4) return 0; outlen = len - 4; - if ((out = zmalloc(outlen + 1)) == NULL) return 0; - comprlen = lzf_compress(s, len, out, outlen); - if (comprlen == 0) { - zfree(out); - return 0; + if (outlen < LZF_STATIC_BUFFER_SIZE) { + if (!buffer) buffer = zmalloc(LZF_STATIC_BUFFER_SIZE); + out = buffer; + } else { + if ((out = zmalloc(outlen + 1)) == NULL) return 0; } - ssize_t nwritten = rdbSaveLzfBlob(rdb, out, comprlen, len); - zfree(out); + comprlen = lzf_compress(s, len, out, outlen); + ssize_t nwritten = comprlen ? rdbSaveLzfBlob(rdb, out, comprlen, len) : 0; + if (out != buffer) zfree(out); return nwritten; } diff --git a/src/rdma.c b/src/rdma.c index 15e23758b7..7fe65ad2d2 100644 --- a/src/rdma.c +++ b/src/rdma.c @@ -10,9 +10,10 @@ #define VALKEYMODULE_CORE_MODULE #include "server.h" - -#if defined USE_RDMA && defined __linux__ /* currently RDMA is only supported on Linux */ #include "connection.h" + +#if defined __linux__ /* currently RDMA is only supported on Linux */ +#if (USE_RDMA == 1 /* BUILD_YES */) || ((USE_RDMA == 2 /* BUILD_MODULE */) && (BUILD_RDMA_MODULE == 2)) #include "connhelpers.h" #include @@ -76,9 +77,12 @@ typedef enum ValkeyRdmaOpcode { #define VALKEY_RDMA_INVALID_OPCODE 0xffff #define VALKEY_RDMA_KEEPALIVE_MS 3000 +#define RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 0) + typedef struct rdma_connection { connection c; struct rdma_cm_id *cm_id; + int flags; int last_errno; listNode *pending_list_node; } rdma_connection; @@ -128,12 +132,10 @@ typedef struct rdma_listener { static list *pending_list; static rdma_listener 
*rdma_listeners; +static serverRdmaContextConfig *rdma_config; static ConnectionType CT_RDMA; -static int valkey_rdma_rx_size = VALKEY_RDMA_DEFAULT_RX_SIZE; -static int valkey_rdma_comp_vector = -1; /* -1 means a random one */ - static void serverRdmaError(char *err, const char *fmt, ...) { va_list ap; @@ -143,12 +145,34 @@ static void serverRdmaError(char *err, const char *fmt, ...) { va_end(ap); } +static inline int connRdmaAllowCommand(void) { + /* RDMA MR is not accessible in a child process, avoid segment fault due to + * invalid MR access, close it rather than server random crash */ + if (server.in_fork_child != CHILD_TYPE_NONE) { + return C_ERR; + } + + return C_OK; +} + +static inline int connRdmaAllowRW(connection *conn) { + if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + return C_ERR; + } + + return connRdmaAllowCommand(); +} + static int rdmaPostRecv(RdmaContext *ctx, struct rdma_cm_id *cm_id, ValkeyRdmaCmd *cmd) { struct ibv_sge sge; size_t length = sizeof(ValkeyRdmaCmd); struct ibv_recv_wr recv_wr, *bad_wr; int ret; + if (connRdmaAllowCommand()) { + return C_ERR; + } + sge.addr = (uint64_t)cmd; sge.length = length; sge.lkey = ctx->cmd_mr->lkey; @@ -250,7 +274,7 @@ static int rdmaSetupIoBuf(RdmaContext *ctx, struct rdma_cm_id *cm_id) { /* setup recv buf & MR */ access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_WRITE; - length = valkey_rdma_rx_size; + length = rdma_config->rx_size; ctx->rx.addr = page_aligned_zalloc(length); ctx->rx.length = length; ctx->rx.mr = ibv_reg_mr(ctx->pd, ctx->rx.addr, length, access); @@ -273,6 +297,7 @@ static int rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { struct ibv_comp_channel *comp_channel = NULL; struct ibv_cq *cq = NULL; struct ibv_pd *pd = NULL; + int comp_vector = rdma_config->completion_vector; if (ibv_query_device(cm_id->verbs, &device_attr)) { serverLog(LL_WARNING, "RDMA: ibv ibv query device failed"); @@ -295,8 +320,13 @@ static int 
rdmaCreateResource(RdmaContext *ctx, struct rdma_cm_id *cm_id) { ctx->comp_channel = comp_channel; + /* negative number means a random one */ + if (comp_vector < 0) { + comp_vector = abs((int)random()); + } + cq = ibv_create_cq(cm_id->verbs, VALKEY_RDMA_MAX_WQE * 2, NULL, comp_channel, - valkey_rdma_comp_vector % cm_id->verbs->num_comp_vectors); + comp_vector % cm_id->verbs->num_comp_vectors); if (!cq) { serverLog(LL_WARNING, "RDMA: ibv create cq failed"); return C_ERR; @@ -451,13 +481,22 @@ static int rdmaHandleEstablished(struct rdma_cm_event *ev) { return C_OK; } +static inline void rdmaDelKeepalive(aeEventLoop *el, RdmaContext *ctx) { + if (ctx->keepalive_te == AE_ERR) { + return; + } + + aeDeleteTimeEvent(el, ctx->keepalive_te); + ctx->keepalive_te = AE_ERR; +} + static int rdmaHandleDisconnect(aeEventLoop *el, struct rdma_cm_event *ev) { struct rdma_cm_id *cm_id = ev->id; RdmaContext *ctx = cm_id->context; connection *conn = ctx->conn; rdma_connection *rdma_conn = (rdma_connection *)conn; - aeDeleteTimeEvent(el, ctx->keepalive_te); + rdmaDelKeepalive(el, ctx); conn->state = CONN_STATE_CLOSED; /* we can't close connection now, let's mark this connection as closed state */ @@ -657,7 +696,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* uplayer should read all */ - while (ctx->rx.pos < ctx->rx.offset) { + while (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->rx.pos < ctx->rx.offset) { if (conn->read_handler && (callHandler(conn, conn->read_handler) == C_ERR)) { return; } @@ -669,7 +708,7 @@ static void connRdmaEventHandler(struct aeEventLoop *el, int fd, void *clientDat } /* RDMA comp channel has no POLLOUT event, try to send remaining buffer */ - if ((ctx->tx.offset < ctx->tx.length) && conn->write_handler) { + if (!(rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) && ctx->tx.offset < ctx->tx.length && conn->write_handler) { callHandler(conn, conn->write_handler); } } @@ -848,6 +887,9 @@ 
static void connRdmaAcceptHandler(aeEventLoop *el, int fd, void *privdata, int m } static int connRdmaSetRwHandler(connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) return C_OK; + /* IB channel only has POLLIN event */ if (conn->read_handler || conn->write_handler) { if (aeCreateFileEvent(server.el, conn->fd, AE_READABLE, conn->type->ae_handler, conn) == AE_ERR) { @@ -1168,11 +1210,20 @@ static void connRdmaClose(connection *conn) { conn->fd = -1; } + /* If called from within a handler, schedule the close but + * keep the connection until the handler returns. + */ + if (connHasRefs(conn)) { + conn->flags |= CONN_FLAG_CLOSE_SCHEDULED; + return; + } + if (!cm_id) { return; } ctx = cm_id->context; + rdmaDelKeepalive(server.el, ctx); rdma_disconnect(cm_id); /* poll all CQ before close */ @@ -1204,6 +1255,10 @@ static size_t connRdmaSend(connection *conn, const void *data, size_t data_len) char *remote_addr = ctx->tx_addr + ctx->tx.offset; int ret; + if (connRdmaAllowCommand()) { + return C_ERR; + } + memcpy(addr, data, data_len); sge.addr = (uint64_t)addr; @@ -1237,7 +1292,7 @@ static int connRdmaWrite(connection *conn, const void *data, size_t data_len) { RdmaContext *ctx = cm_id->context; uint32_t towrite; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1280,7 +1335,7 @@ static int connRdmaRead(connection *conn, void *buf, size_t buf_len) { struct rdma_cm_id *cm_id = rdma_conn->cm_id; RdmaContext *ctx = cm_id->context; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1302,7 +1357,7 @@ static ssize_t connRdmaSyncWrite(connection *conn, char *ptr, ssize_t size, long long long start = mstime(); uint32_t towrite; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return 
C_ERR; } @@ -1345,7 +1400,7 @@ static ssize_t connRdmaSyncRead(connection *conn, char *ptr, ssize_t size, long long long start = mstime(); uint32_t toread; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1380,7 +1435,7 @@ static ssize_t connRdmaSyncReadLine(connection *conn, char *ptr, ssize_t size, l char *c; char nl = 0; - if (conn->state == CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { + if (connRdmaAllowRW(conn)) { return C_ERR; } @@ -1566,9 +1621,28 @@ int connRdmaListen(connListener *listener) { rdma_listener++; } + rdma_config = listener->priv; return C_OK; } +static void connRdmaCloseListener(connListener *listener) { + /* Close old servers */ + for (int i = 0; i < listener->count; i++) { + if (listener->fd[i] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); + listener->fd[i] = -1; + struct rdma_listener *rdma_listener = &rdma_listeners[i]; + rdma_destroy_id(rdma_listener->cm_id); + rdma_destroy_event_channel(rdma_listener->cm_channel); + } + + listener->count = 0; + zfree(rdma_listeners); + rdma_listeners = NULL; + rdma_config = NULL; +} + static int connRdmaAddr(connection *conn, char *ip, size_t ip_len, int *port, int remote) { rdma_connection *rdma_conn = (rdma_connection *)conn; struct rdma_cm_id *cm_id = rdma_conn->cm_id; @@ -1653,36 +1727,52 @@ static int rdmaProcessPendingData(void) { listNode *ln; rdma_connection *rdma_conn; connection *conn; - listNode *node; - int processed; + int processed = 0; - processed = listLength(pending_list); listRewind(pending_list, &li); while ((ln = listNext(&li))) { rdma_conn = listNodeValue(ln); + if (rdma_conn->flags & RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE) continue; conn = &rdma_conn->c; - node = rdma_conn->pending_list_node; /* a connection can be disconnected by remote peer, CM event mark state as CONN_STATE_CLOSED, kick connection * read/write handler to close connection */ if (conn->state == 
CONN_STATE_ERROR || conn->state == CONN_STATE_CLOSED) { - listDelNode(pending_list, node); - /* do NOT call callHandler(conn, conn->read_handler) here, conn is freed in handler! */ - if (conn->read_handler) { - conn->read_handler(conn); - } else if (conn->write_handler) { - conn->write_handler(conn); + listDelNode(pending_list, rdma_conn->pending_list_node); + rdma_conn->pending_list_node = NULL; + /* Invoke both read_handler and write_handler, unless read_handler + returns 0, indicating the connection has closed, in which case + write_handler will be skipped. */ + if (callHandler(conn, conn->read_handler)) { + callHandler(conn, conn->write_handler); } + ++processed; continue; } connRdmaEventHandler(NULL, -1, rdma_conn, 0); + ++processed; } return processed; } +static void postPoneUpdateRdmaState(struct connection *conn, int postpone) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + if (postpone) { + rdma_conn->flags |= RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } else { + rdma_conn->flags &= ~RDMA_CONN_FLAG_POSTPONE_UPDATE_STATE; + } +} + +static void updateRdmaState(struct connection *conn) { + rdma_connection *rdma_conn = (rdma_connection *)conn; + connRdmaSetRwHandler(conn); + connRdmaEventHandler(NULL, -1, rdma_conn, 0); +} + static ConnectionType CT_RDMA = { /* connection type */ .get_type = connRdmaGetType, @@ -1697,6 +1787,7 @@ static ConnectionType CT_RDMA = { //.cluster_accept_handler = NULL, .is_local = connRdmaIsLocal, .listen = connRdmaListen, + .closeListener = connRdmaCloseListener, .addr = connRdmaAddr, /* create/close connection */ @@ -1724,19 +1815,10 @@ static ConnectionType CT_RDMA = { /* pending data */ .has_pending_data = rdmaHasPendingData, .process_pending_data = rdmaProcessPendingData, + .postpone_update_state = postPoneUpdateRdmaState, + .update_state = updateRdmaState, }; -static struct connListener *rdmaListener(void) { - static struct connListener *listener = NULL; - - if (listener) return listener; - - listener = 
listenerByType(CONN_TYPE_RDMA); - serverAssert(listener != NULL); - - return listener; -} - ConnectionType *connectionTypeRdma(void) { static ConnectionType *ct_rdma = NULL; @@ -1748,133 +1830,28 @@ ConnectionType *connectionTypeRdma(void) { return ct_rdma; } -/* rdma listener has different create/close logic from TCP, we can't re-use 'int changeListener(connListener *listener)' - * directly */ -static int rdmaChangeListener(void) { - struct connListener *listener = rdmaListener(); - - /* Close old servers */ - for (int i = 0; i < listener->count; i++) { - if (listener->fd[i] == -1) continue; - - aeDeleteFileEvent(server.el, listener->fd[i], AE_READABLE); - listener->fd[i] = -1; - struct rdma_listener *rdma_listener = &rdma_listeners[i]; - rdma_destroy_id(rdma_listener->cm_id); - rdma_destroy_event_channel(rdma_listener->cm_channel); - } - - listener->count = 0; - zfree(rdma_listeners); - rdma_listeners = NULL; - - closeListener(listener); - - /* Just close the server if port disabled */ - if (listener->port == 0) { - if (server.set_proc_title) serverSetProcTitle(NULL); - return VALKEYMODULE_OK; - } - - /* Re-create listener */ - if (connListen(listener) != C_OK) { - return VALKEYMODULE_ERR; - } - - /* Create event handlers */ - if (createSocketAcceptHandler(listener, listener->ct->accept_handler) != C_OK) { - serverPanic("Unrecoverable error creating %s accept handler.", listener->ct->get_type(NULL)); - } - - if (server.set_proc_title) serverSetProcTitle(NULL); - - return VALKEYMODULE_OK; -} - -#ifdef BUILD_RDMA_MODULE - -#include "release.h" - -static long long rdmaGetPort(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - struct connListener *listener = rdmaListener(); - - return listener->port; -} - -static int rdmaSetPort(const char *name, long long val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(privdata); - UNUSED(err); - struct connListener *listener = rdmaListener(); - listener->port = val; - - return 
VALKEYMODULE_OK; +int RegisterConnectionTypeRdma(void) { + return connTypeRegister(&CT_RDMA); } -static ValkeyModuleString *rdma_bind; +#else -static void rdmaBuildBind(void *ctx) { - struct connListener *listener = rdmaListener(); - - if (rdma_bind) ValkeyModule_FreeString(NULL, rdma_bind); - - sds rdma_bind_str = sdsjoin(listener->bindaddr, listener->bindaddr_count, " "); - rdma_bind = ValkeyModule_CreateString(ctx, rdma_bind_str, sdslen(rdma_bind_str)); -} - -static ValkeyModuleString *rdmaGetBind(const char *name, void *privdata) { - UNUSED(name); - UNUSED(privdata); - - return rdma_bind; +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s not builtin", CONN_TYPE_RDMA); + return C_ERR; } -static int rdmaSetBind(const char *name, ValkeyModuleString *val, void *privdata, ValkeyModuleString **err) { - UNUSED(name); - UNUSED(err); - struct connListener *listener = rdmaListener(); - const char *bind = ValkeyModule_StringPtrLen(val, NULL); - int nexts; - sds *exts = sdssplitlen(bind, strlen(bind), " ", 1, &nexts); - - if (nexts > CONFIG_BINDADDR_MAX) { - serverLog(LL_WARNING, "RDMA: Unsupported bind ( > %d)", CONFIG_BINDADDR_MAX); - return VALKEYMODULE_ERR; - } - - /* Free old bind addresses */ - for (int j = 0; j < listener->bindaddr_count; j++) { - zfree(listener->bindaddr[j]); - } - - for (int j = 0; j < nexts; j++) listener->bindaddr[j] = zstrdup(exts[j]); - listener->bindaddr_count = nexts; +#endif - sdsfreesplitres(exts, nexts); - rdmaBuildBind(privdata); +#if BUILD_RDMA_MODULE == 2 /* BUILD_MODULE */ - return VALKEYMODULE_OK; -} - -static int rdmaApplyListener(ValkeyModuleCtx *ctx, void *privdata, ValkeyModuleString **err) { - UNUSED(ctx); - UNUSED(privdata); - UNUSED(err); - - return rdmaChangeListener(); -} +#include "release.h" -static void rdmaListenerAddConfig(void *ctx) { - serverAssert(ValkeyModule_RegisterNumericConfig(ctx, "port", 0, VALKEYMODULE_CONFIG_DEFAULT, 0, 65535, rdmaGetPort, - rdmaSetPort, rdmaApplyListener, 
NULL) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_RegisterStringConfig(ctx, "bind", "", VALKEYMODULE_CONFIG_DEFAULT, rdmaGetBind, - rdmaSetBind, rdmaApplyListener, ctx) == VALKEYMODULE_OK); - serverAssert(ValkeyModule_LoadConfigs(ctx) == VALKEYMODULE_OK); -} int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { + UNUSED(argv); + UNUSED(argc); + /* Connection modules MUST be part of the same build as valkey. */ if (strcmp(REDIS_BUILD_ID_RAW, serverBuildIdRaw())) { serverLog(LL_NOTICE, "Connection type %s was not built together with the valkey-server used.", CONN_TYPE_RDMA); @@ -1893,40 +1870,6 @@ int ValkeyModule_OnLoad(void *ctx, ValkeyModuleString **argv, int argc) { if (connTypeRegister(&CT_RDMA) != C_OK) return VALKEYMODULE_ERR; - rdmaListenerAddConfig(ctx); - - struct connListener *listener = rdmaListener(); - listener->ct = connectionTypeRdma(); - listener->bindaddr = zcalloc_num(CONFIG_BINDADDR_MAX, sizeof(listener->bindaddr[0])); - - for (int i = 0; i < argc; i++) { - robj *str = (robj *)argv[i]; - int nexts; - sds *exts = sdssplitlen(str->ptr, strlen(str->ptr), "=", 1, &nexts); - if (nexts != 2) { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - if (!strcasecmp(exts[0], "bind")) { - listener->bindaddr[listener->bindaddr_count++] = zstrdup(exts[1]); - } else if (!strcasecmp(exts[0], "port")) { - listener->port = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "rx-size")) { - valkey_rdma_rx_size = atoi(exts[1]); - } else if (!strcasecmp(exts[0], "comp-vector")) { - valkey_rdma_comp_vector = atoi(exts[1]); - } else { - serverLog(LL_WARNING, "RDMA: Unsupported argument \"%s\"", (char *)str->ptr); - return VALKEYMODULE_ERR; - } - - sdsfreesplitres(exts, nexts); - } - - rdmaBuildBind(ctx); - if (valkey_rdma_comp_vector == -1) valkey_rdma_comp_vector = abs((int)random()); - return VALKEYMODULE_OK; } @@ -1938,4 +1881,11 @@ int ValkeyModule_OnUnload(void *arg) { #endif /* 
BUILD_RDMA_MODULE */ -#endif /* USE_RDMA && __linux__ */ +#else /* __linux__ */ + +int RegisterConnectionTypeRdma(void) { + serverLog(LL_VERBOSE, "Connection type %s is supported on Linux only", CONN_TYPE_RDMA); + return C_ERR; +} + +#endif /* __linux__ */ diff --git a/src/replication.c b/src/replication.c index 685f46c9f2..7cc881934d 100644 --- a/src/replication.c +++ b/src/replication.c @@ -227,9 +227,9 @@ void addRdbReplicaToPsyncWait(client *replica_rdb_client) { tail->refcount++; } } - serverLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", - replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, - tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); + dualChannelServerLog(LL_DEBUG, "Add rdb replica %s to waiting psync, with cid %llu, %s ", + replicationGetReplicaName(replica_rdb_client), (unsigned long long)replica_rdb_client->id, + tail ? "tracking repl-backlog tail" : "no repl-backlog to track"); replica_rdb_client->ref_repl_buf_node = tail ? ln : NULL; /* Prevent rdb client from being freed before psync is established. 
*/ replica_rdb_client->flag.protected_rdb_channel = 1; @@ -252,8 +252,8 @@ void backfillRdbReplicasToPsyncWait(void) { if (replica_rdb_client->ref_repl_buf_node) continue; replica_rdb_client->ref_repl_buf_node = ln; head->refcount++; - serverLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", - (long long unsigned int)replica_rdb_client->id); + dualChannelServerLog(LL_DEBUG, "Attach replica rdb client %llu to repl buf block", + (long long unsigned int)replica_rdb_client->id); } raxStop(&iter); } @@ -271,10 +271,10 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { } replica_rdb_client->ref_repl_buf_node = NULL; replica_rdb_client->flag.protected_rdb_channel = 0; - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", - replicationGetReplicaName(replica_main_client), - (long long unsigned int)replica_main_client->associated_rdb_client_id, - o ? "ref count decreased" : "doesn't exist"); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu, repl buffer block %s", + replicationGetReplicaName(replica_main_client), + (long long unsigned int)replica_main_client->associated_rdb_client_id, + o ? 
"ref count decreased" : "doesn't exist"); uint64_t id = htonu64(replica_rdb_client->id); raxRemove(server.replicas_waiting_psync, (unsigned char *)&id, sizeof(id), NULL); } @@ -391,8 +391,8 @@ void freeReplicaReferencedReplBuffer(client *replica) { if (replica->flag.repl_rdb_channel) { uint64_t rdb_cid = htonu64(replica->id); if (raxRemove(server.replicas_waiting_psync, (unsigned char *)&rdb_cid, sizeof(rdb_cid), NULL)) { - serverLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", - replicationGetReplicaName(replica), (long long unsigned int)replica->id); + dualChannelServerLog(LL_DEBUG, "Remove psync waiting replica %s with cid %llu from replicas rax.", + replicationGetReplicaName(replica), (long long unsigned int)replica->id); } } if (replica->ref_repl_buf_node != NULL) { @@ -651,7 +651,7 @@ void replicationFeedStreamFromPrimaryStream(char *buf, size_t buflen) { /* Debugging: this is handy to see the stream sent from primary * to replicas. Disabled with if(0). */ if (0) { - if (server.hide_user_data_from_log) { + if (!server.hide_user_data_from_log) { printf("%zu:", buflen); for (size_t j = 0; j < buflen; j++) { printf("%c", isprint(buf[j]) ? buf[j] : '.'); @@ -1051,7 +1051,7 @@ void syncCommand(client *c) { } else { replicationUnsetPrimary(); } - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (failover request from '%s')", client); sdsfree(client); } else { @@ -1121,10 +1121,11 @@ void syncCommand(client *c) { * resync. */ if (primary_replid[0] != '?') server.stat_sync_partial_err++; if (c->replica_capa & REPLICA_CAPA_DUAL_CHANNEL) { - serverLog(LL_NOTICE, - "Replica %s is capable of dual channel synchronization, and partial sync isn't possible. 
" - "Full sync will continue with dedicated RDB channel.", - replicationGetReplicaName(c)); + dualChannelServerLog(LL_NOTICE, + "Replica %s is capable of dual channel synchronization, and partial sync " + "isn't possible. " + "Full sync will continue with dedicated RDB channel.", + replicationGetReplicaName(c)); const char *buf = "+DUALCHANNELSYNC\r\n"; if (connWrite(c->conn, buf, strlen(buf)) != (int)strlen(buf)) { freeClientAsync(c); @@ -1669,7 +1670,9 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, if (!conn) continue; stillUp++; } - serverLog(LL_NOTICE, "Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); + if (stillUp) { + serverLog(LL_NOTICE, "Diskless rdb transfer, done reading from pipe, %d replicas still up.", stillUp); + } /* Now that the replicas have finished reading, notify the child that it's safe to exit. * When the server detects the child has exited, it can mark the replica as online, and * start streaming the replication buffers. */ @@ -1678,7 +1681,6 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, return; } - int stillAlive = 0; for (i = 0; i < server.rdb_pipe_numconns; i++) { ssize_t nwritten; connection *conn = server.rdb_pipe_conns[i]; @@ -1708,15 +1710,10 @@ void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, server.rdb_pipe_numconns_writing++; connSetWriteHandler(conn, rdbPipeWriteHandler); } - stillAlive++; } - if (stillAlive == 0) { - serverLog(LL_WARNING, "Diskless rdb transfer, last replica dropped, killing fork child."); - killRDBChild(); - } /* Remove the pipe read handler if at least one write handler was set. 
*/ - if (server.rdb_pipe_numconns_writing || stillAlive == 0) { + if (server.rdb_pipe_numconns_writing) { aeDeleteFileEvent(server.el, server.rdb_pipe_read, AE_READABLE); break; } @@ -1745,6 +1742,8 @@ void updateReplicasWaitingBgsave(int bgsaveerr, int type) { struct valkey_stat buf; if (bgsaveerr != C_OK) { + /* If bgsaveerr is error, there is no need to protect the rdb channel. */ + replica->flag.protected_rdb_channel = 0; freeClientAsync(replica); serverLog(LL_WARNING, "SYNC failed. BGSAVE child returned an error"); continue; @@ -1983,7 +1982,20 @@ serverDb *disklessLoadInitTempDb(void) { /* Helper function for readSyncBulkPayload() to discard our tempDb * when the loading succeeded or failed. */ void disklessLoadDiscardTempDb(serverDb *tempDb) { - discardTempDb(tempDb, replicationEmptyDbCallback); + discardTempDb(tempDb); +} + +/* Helper function for to initialize temp function lib context. + * The temp ctx may be populated by functionsLibCtxSwapWithCurrent or + * freed by disklessLoadDiscardFunctionsLibCtx later. */ +functionsLibCtx *disklessLoadFunctionsLibCtxCreate(void) { + return functionsLibCtxCreate(); +} + +/* Helper function to discard our temp function lib context + * when the loading succeeded or failed. */ +void disklessLoadDiscardFunctionsLibCtx(functionsLibCtx *temp_functions_lib_ctx) { + freeFunctionsAsync(temp_functions_lib_ctx); } /* If we know we got an entirely different data set from our primary @@ -2093,8 +2105,7 @@ void readSyncBulkPayload(connection *conn) { } serverLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s", (nread == -1) ? connGetLastError(conn) : "connection lost"); - cancelReplicationHandshake(1); - return; + goto error; } server.stat_net_repl_input_bytes += nread; @@ -2189,7 +2200,7 @@ void readSyncBulkPayload(connection *conn) { if (use_diskless_load && server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { /* Initialize empty tempDb dictionaries. 
*/ diskless_load_tempDb = disklessLoadInitTempDb(); - temp_functions_lib_ctx = functionsLibCtxCreate(); + temp_functions_lib_ctx = disklessLoadFunctionsLibCtxCreate(); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_STARTED, NULL); } @@ -2229,7 +2240,6 @@ void readSyncBulkPayload(connection *conn) { dbarray = server.db; functions_lib_ctx = functionsLibCtxGetCurrent(); - functionsLibCtxClear(functions_lib_ctx); } rioInitWithConn(&rdb, conn, server.repl_transfer_size); @@ -2259,7 +2269,6 @@ void readSyncBulkPayload(connection *conn) { if (loadingFailed) { stopLoading(0); - cancelReplicationHandshake(1); rioFreeConn(&rdb, NULL); if (server.repl_diskless_load == REPL_DISKLESS_LOAD_SWAPDB) { @@ -2268,7 +2277,7 @@ void readSyncBulkPayload(connection *conn) { NULL); disklessLoadDiscardTempDb(diskless_load_tempDb); - functionsLibCtxFree(temp_functions_lib_ctx); + disklessLoadDiscardFunctionsLibCtx(temp_functions_lib_ctx); serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Discarding temporary DB in background"); } else { /* Remove the half-loaded data in case we started with an empty replica. */ @@ -2279,7 +2288,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on SYNC * failure, it'll be restarted when sync succeeds or the replica * gets promoted. */ - return; + goto error; } /* RDB loading succeeded if we reach this point. 
*/ @@ -2293,7 +2302,7 @@ void readSyncBulkPayload(connection *conn) { swapMainDbWithTempDb(diskless_load_tempDb); /* swap existing functions ctx with the temporary one */ - functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx); + functionsLibCtxSwapWithCurrent(temp_functions_lib_ctx, 1); moduleFireServerEvent(VALKEYMODULE_EVENT_REPL_ASYNC_LOAD, VALKEYMODULE_SUBEVENT_REPL_ASYNC_LOAD_COMPLETED, NULL); @@ -2321,8 +2330,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync the temp DB to disk in " "PRIMARY <-> REPLICA synchronization: %s", strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* Rename rdb like renaming rewrite aof asynchronously. */ @@ -2332,9 +2340,8 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to rename the temp DB into %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); if (old_rdb_fd != -1) close(old_rdb_fd); - return; + goto error; } /* Close old rdb asynchronously. */ if (old_rdb_fd != -1) bioCreateCloseJob(old_rdb_fd, 0, 0); @@ -2345,8 +2352,7 @@ void readSyncBulkPayload(connection *conn) { "Failed trying to sync DB directory %s in " "PRIMARY <-> REPLICA synchronization: %s", server.rdb_filename, strerror(errno)); - cancelReplicationHandshake(1); - return; + goto error; } /* We will soon start loading the RDB from disk, the replication history is changed, @@ -2363,7 +2369,6 @@ void readSyncBulkPayload(connection *conn) { if (rdbLoad(server.rdb_filename, &rsi, RDBFLAGS_REPLICATION) != RDB_OK) { serverLog(LL_WARNING, "Failed trying to load the PRIMARY synchronization " "DB from disk, check server logs."); - cancelReplicationHandshake(1); if (server.rdb_del_sync_files && allPersistenceDisabled()) { serverLog(LL_NOTICE, "Removing the RDB file obtained from " "the primary. 
This replica has persistence " @@ -2377,7 +2382,7 @@ void readSyncBulkPayload(connection *conn) { /* Note that there's no point in restarting the AOF on sync failure, it'll be restarted when sync succeeds or replica promoted. */ - return; + goto error; } /* Cleanup. */ @@ -2400,10 +2405,10 @@ void readSyncBulkPayload(connection *conn) { } else { replicationCreatePrimaryClient(server.repl_transfer_s, rsi.repl_stream_db); server.repl_state = REPL_STATE_CONNECTED; + server.repl_down_since = 0; /* Send the initial ACK immediately to put this replica in online state. */ replicationSendAck(); } - server.repl_down_since = 0; /* Fire the primary link modules event. */ moduleFireServerEvent(VALKEYMODULE_EVENT_PRIMARY_LINK_CHANGE, VALKEYMODULE_SUBEVENT_PRIMARY_LINK_UP, NULL); @@ -2561,7 +2566,7 @@ void freePendingReplDataBuf(void) { * provisional primary struct, and free local replication buffer. */ void replicationAbortDualChannelSyncTransfer(void) { serverAssert(server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE); - serverLog(LL_NOTICE, "Aborting dual channel sync"); + dualChannelServerLog(LL_NOTICE, "Aborting dual channel sync"); if (server.repl_rdb_transfer_s) { connClose(server.repl_rdb_transfer_s); server.repl_rdb_transfer_s = NULL; @@ -2590,8 +2595,9 @@ int sendCurrentOffsetToReplica(client *replica) { int buflen; buflen = snprintf(buf, sizeof(buf), "$ENDOFF:%lld %s %d %llu\r\n", server.primary_repl_offset, server.replid, server.db->id, (long long unsigned int)replica->id); - serverLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", - replicationGetReplicaName(replica), server.primary_repl_offset, (long long unsigned int)replica->id); + dualChannelServerLog(LL_NOTICE, "Sending to replica %s RDB end offset %lld and client-id %llu", + replicationGetReplicaName(replica), server.primary_repl_offset, + (long long unsigned int)replica->id); if (connSyncWrite(replica->conn, buf, buflen, server.repl_syncio_timeout * 1000) != buflen) { 
freeClientAsync(replica); return C_ERR; @@ -2600,7 +2606,7 @@ int sendCurrentOffsetToReplica(client *replica) { } static int dualChannelReplHandleHandshake(connection *conn, sds *err) { - serverLog(LL_DEBUG, "Received first reply from primary using rdb connection."); + dualChannelServerLog(LL_DEBUG, "Received first reply from primary using rdb connection."); /* AUTH with the primary if required. */ if (server.primary_auth) { char *args[] = {"AUTH", NULL, NULL}; @@ -2616,7 +2622,7 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { argc++; *err = sendCommandArgv(conn, argc, args, lens); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } } @@ -2626,14 +2632,14 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { NULL); sdsfree(portstr); if (*err) { - serverLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); + dualChannelServerLog(LL_WARNING, "Sending command to primary in dual channel replication handshake: %s", *err); return C_ERR; } if (connSetReadHandler(conn, dualChannelFullSyncWithPrimary) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't create readable event for SYNC: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); return C_ERR; } return C_OK; @@ -2642,11 +2648,11 @@ static int dualChannelReplHandleHandshake(connection *conn, sds *err) { static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); + 
dualChannelServerLog(LL_WARNING, "Primary did not respond to auth command during SYNC handshake"); return C_ERR; } if ((*err)[0] == '-') { - serverLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); + dualChannelServerLog(LL_WARNING, "Unable to AUTH to Primary: %s", *err); return C_ERR; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_RECEIVE_REPLCONF_REPLY; @@ -2656,17 +2662,17 @@ static int dualChannelReplHandleAuthReply(connection *conn, sds *err) { static int dualChannelReplHandleReplconfReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) { - serverLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); + dualChannelServerLog(LL_WARNING, "Primary did not respond to replconf command during SYNC handshake"); return C_ERR; } if (*err[0] == '-') { - serverLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", - *err); + dualChannelServerLog(LL_NOTICE, "Server does not support sync with offset, dual channel sync approach cannot be used: %s", + *err); return C_ERR; } if (connSyncWrite(conn, "SYNC\r\n", 6, server.repl_syncio_timeout * 1000) == -1) { - serverLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "I/O error writing to Primary: %s", connGetLastError(conn)); return C_ERR; } return C_OK; @@ -2680,7 +2686,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { } if (*err[0] == '\0') { /* Retry again later */ - serverLog(LL_DEBUG, "Received empty $ENDOFF response"); + dualChannelServerLog(LL_DEBUG, "Received empty $ENDOFF response"); return C_RETRY; } long long reploffset; @@ -2689,7 +2695,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Parse end offset response */ char *endoff_format = "$ENDOFF:%lld %40s %d %llu"; if (sscanf(*err, endoff_format, &reploffset, primary_replid, &dbid, &rdb_client_id) != 4) { 
- serverLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); + dualChannelServerLog(LL_WARNING, "Received unexpected $ENDOFF response: %s", *err); return C_ERR; } server.rdb_client_id = rdb_client_id; @@ -2697,7 +2703,7 @@ static int dualChannelReplHandleEndOffsetResponse(connection *conn, sds *err) { /* Initiate repl_provisional_primary to act as this replica temp primary until RDB is loaded */ server.repl_provisional_primary.conn = server.repl_transfer_s; - memcpy(server.repl_provisional_primary.replid, primary_replid, CONFIG_RUN_ID_SIZE); + memcpy(server.repl_provisional_primary.replid, primary_replid, sizeof(server.repl_provisional_primary.replid)); server.repl_provisional_primary.reploff = reploffset; server.repl_provisional_primary.read_reploff = reploffset; server.repl_provisional_primary.dbid = dbid; @@ -2737,7 +2743,8 @@ static void dualChannelFullSyncWithPrimary(connection *conn) { /* Check for errors in the socket: after a non blocking connect() we * may find that the socket is in error state. 
*/ if (connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", connGetLastError(conn)); + dualChannelServerLog(LL_WARNING, "Error condition on socket for dual channel replication: %s", + connGetLastError(conn)); goto error; } switch (server.repl_rdb_channel_state) { @@ -2824,10 +2831,9 @@ typedef struct replDataBufBlock { * Reads replication data from primary into specified repl buffer block */ int readIntoReplDataBlock(connection *conn, replDataBufBlock *data_block, size_t read) { int nread = connRead(conn, data_block->buf + data_block->used, read); - if (nread <= 0) { if (nread == 0 || connGetState(conn) != CONN_STATE_CONNECTED) { - serverLog(LL_WARNING, "Provisional primary closed connection"); + dualChannelServerLog(LL_WARNING, "Provisional primary closed connection"); /* Signal ongoing RDB load to terminate gracefully */ if (server.loading_rio) rioCloseASAP(server.loading_rio); cancelReplicationHandshake(1); @@ -2859,7 +2865,7 @@ void bufferReplData(connection *conn) { if (readlen && remaining_bytes == 0) { if (server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes && server.pending_repl_data.len > server.client_obuf_limits[CLIENT_TYPE_REPLICA].hard_limit_bytes) { - serverLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); + dualChannelServerLog(LL_NOTICE, "Replication buffer limit reached, stopping buffering."); /* Stop accumulating primary commands. */ connSetReadHandler(conn, NULL); break; @@ -2932,7 +2938,7 @@ void dualChannelSyncSuccess(void) { /* Wait for the accumulated buffer to be processed before reading any more replication updates */ if (server.pending_repl_data.blocks && streamReplDataBufToDb(server.primary) == C_ERR) { /* Sync session aborted during repl data streaming. 
*/ - serverLog(LL_WARNING, "Failed to stream local replication buffer into memory"); + dualChannelServerLog(LL_WARNING, "Failed to stream local replication buffer into memory"); /* Verify sync is still in progress */ if (server.repl_rdb_channel_state != REPL_DUAL_CHANNEL_STATE_NONE) { replicationAbortDualChannelSyncTransfer(); @@ -2941,7 +2947,7 @@ void dualChannelSyncSuccess(void) { return; } freePendingReplDataBuf(); - serverLog(LL_NOTICE, "Successfully streamed replication data into memory"); + dualChannelServerLog(LL_NOTICE, "Successfully streamed replication data into memory"); /* We can resume reading from the primary connection once the local replication buffer has been loaded. */ replicationSteadyStateInit(); replicationSendAck(); /* Send ACK to notify primary that replica is synced */ @@ -2957,7 +2963,7 @@ int dualChannelSyncHandlePsync(void) { if (server.repl_rdb_channel_state < REPL_DUAL_CHANNEL_RDB_LOADED) { /* RDB is still loading */ if (connSetReadHandler(server.repl_provisional_primary.conn, bufferReplData) == C_ERR) { - serverLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); + dualChannelServerLog(LL_WARNING, "Error while setting readable handler: %s", strerror(errno)); cancelReplicationHandshake(1); return C_ERR; } @@ -2966,7 +2972,7 @@ int dualChannelSyncHandlePsync(void) { } serverAssert(server.repl_rdb_channel_state == REPL_DUAL_CHANNEL_RDB_LOADED); /* RDB is loaded */ - serverLog(LL_DEBUG, "Dual channel sync - psync established after rdb load"); + dualChannelServerLog(LL_DEBUG, "Psync established after rdb load"); dualChannelSyncSuccess(); return C_OK; } @@ -3060,8 +3066,9 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* While in dual channel replication, we should use our prepared repl id and offset. 
*/ psync_replid = server.repl_provisional_primary.replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.repl_provisional_primary.reploff + 1); - serverLog(LL_NOTICE, "Trying a partial resynchronization using main channel (request %s:%s).", psync_replid, - psync_offset); + dualChannelServerLog(LL_NOTICE, + "Trying a partial resynchronization using main channel (request %s:%s).", + psync_replid, psync_offset); } else if (server.cached_primary) { psync_replid = server.cached_primary->replid; snprintf(psync_offset, sizeof(psync_offset), "%lld", server.cached_primary->reploff + 1); @@ -3208,7 +3215,7 @@ int replicaTryPartialResynchronization(connection *conn, int read_reply) { /* A response of +DUALCHANNELSYNC from the primary implies that partial * synchronization is not possible and that the primary supports full * sync using dedicated RDB channel. Full sync will continue that way. */ - serverLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); + dualChannelServerLog(LL_NOTICE, "PSYNC is not possible, initialize RDB channel."); sdsfree(reply); return PSYNC_FULLRESYNC_DUAL_CHANNEL; } @@ -3245,7 +3252,6 @@ int dualChannelReplMainConnSendHandshake(connection *conn, sds *err) { ull2string(llstr, sizeof(llstr), server.rdb_client_id); *err = sendCommand(conn, "REPLCONF", "set-rdb-client-id", llstr, NULL); if (*err) return C_ERR; - server.repl_state = REPL_STATE_RECEIVE_CAPA_REPLY; return C_OK; } @@ -3253,21 +3259,19 @@ int dualChannelReplMainConnRecvCapaReply(connection *conn, sds *err) { *err = receiveSynchronousResponse(conn); if (*err == NULL) return C_ERR; if ((*err)[0] == '-') { - serverLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); + dualChannelServerLog(LL_NOTICE, "Primary does not understand REPLCONF identify: %s", *err); return C_ERR; } - server.repl_state = REPL_STATE_SEND_PSYNC; return C_OK; } int dualChannelReplMainConnSendPsync(connection *conn, sds *err) { if (server.debug_pause_after_fork) 
debugPauseProcess(); if (replicaTryPartialResynchronization(conn, 0) == PSYNC_WRITE_ERROR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Write error."); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Write error."); *err = sdsnew(connGetLastError(conn)); return C_ERR; } - server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; return C_OK; } @@ -3276,8 +3280,8 @@ int dualChannelReplMainConnRecvPsyncReply(connection *conn, sds *err) { if (psync_result == PSYNC_WAIT_REPLY) return C_OK; /* Try again later... */ if (psync_result == PSYNC_CONTINUE) { - serverLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", - server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); + dualChannelServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: Primary accepted a Partial Resynchronization%s", + server.repl_rdb_transfer_s != NULL ? ", RDB load in background." : "."); if (server.supervised_mode == SUPERVISED_SYSTEMD) { serverCommunicateSystemd("STATUS=PRIMARY <-> REPLICA sync: Partial Resynchronization accepted. Ready to " "accept connections in read-write mode.\n"); @@ -3325,7 +3329,7 @@ void dualChannelSetupMainConnForPsync(connection *conn) { } if (ret == C_ERR) { - serverLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? err : ""); + dualChannelServerLog(LL_WARNING, "Aborting dual channel sync. Main channel psync result %d %s", ret, err ? 
err : ""); cancelReplicationHandshake(1); } sdsfree(err); @@ -3382,15 +3386,15 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * ┌────────▼──────────┐ │ │ │DUAL_CHANNEL_RECEIVE_ENDOFF│ │ │by the primary │ * │RECEIVE_IP_REPLY │ │ │ └───────┬───────────────────┘ │ ┌──▼────────────────┐ │ * └────────┬──────────┘ │ │ │$ENDOFF │ │RECEIVE_PSYNC_REPLY│ │ - * │ │ │ ├─────────────────────────┘ └──┬────────────────┘ │ - * │ │ │ │ │+CONTINUE │ - * │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ - * │ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ + * │+OK │ │ ├─────────────────────────┘ └──┬────────────────┘ │ + * ┌────────▼──────────┐ │ │ │ │+CONTINUE │ + * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ ┌──▼────────────────┐ │ + * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOAD │ │TRANSFER │ │ * │+OK │ │ └───────┬───────────────┘ └─────┬─────────────┘ │ - * ┌────────▼──────────┐ │ │ │Done loading │ │ - * │RECEIVE_CAPA_REPLY │ │ │ ┌───────▼───────────────┐ │ │ - * └────────┬──────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ - * │ │ │ └───────┬───────────────┘ │ │ + * ┌────────▼─────────────┐ │ │ │Done loading │ │ + * │RECEIVE_VERSION_REPLY │ │ │ ┌───────▼───────────────┐ │ │ + * └────────┬─────────────┘ │ │ │DUAL_CHANNEL_RDB_LOADED│ │ │ + * │+OK │ │ └───────┬───────────────┘ │ │ * ┌────────▼───┐ │ │ │ │ │ * │SEND_PSYNC │ │ │ │Replica loads local replication │ │ * └─┬──────────┘ │ │ │buffer into memory │ │ @@ -3411,7 +3415,6 @@ void dualChannelSetupMainConnForPsync(connection *conn) { * establish a connection with the primary. */ void syncWithPrimary(connection *conn) { char tmpfile[256], *err = NULL; - int dfd = -1, maxtries = 5; int psync_result; /* If this event fired after the user turned the instance into a primary @@ -3592,6 +3595,7 @@ void syncWithPrimary(connection *conn) { sdsfree(err); err = NULL; server.repl_state = REPL_STATE_RECEIVE_VERSION_REPLY; + return; } /* Receive VERSION reply. 
*/ @@ -3680,11 +3684,16 @@ void syncWithPrimary(connection *conn) { /* Prepare a suitable temp file for bulk transfer */ if (!useDisklessLoad()) { + int dfd = -1, maxtries = 5; while (maxtries--) { snprintf(tmpfile, 256, "temp-%d.%ld.rdb", (int)server.unixtime, (long int)getpid()); dfd = open(tmpfile, O_CREAT | O_WRONLY | O_EXCL, 0644); if (dfd != -1) break; + /* We save the errno of open to prevent some systems from modifying it after + * the sleep call. For example, sleep in Mac will change errno to ETIMEDOUT. */ + int saved_errno = errno; sleep(1); + errno = saved_errno; } if (dfd == -1) { serverLog(LL_WARNING, "Opening the temp file needed for PRIMARY <-> REPLICA synchronization: %s", @@ -3709,8 +3718,8 @@ void syncWithPrimary(connection *conn) { } if (connSetReadHandler(conn, NULL) == C_ERR) { char conninfo[CONN_INFO_LEN]; - serverLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), - connGetInfo(conn, conninfo, sizeof(conninfo))); + dualChannelServerLog(LL_WARNING, "Can't clear main connection handler: %s (%s)", strerror(errno), + connGetInfo(conn, conninfo, sizeof(conninfo))); goto error; } server.repl_rdb_channel_state = REPL_DUAL_CHANNEL_SEND_HANDSHAKE; @@ -3736,7 +3745,6 @@ void syncWithPrimary(connection *conn) { /* Fall through to regular error handling */ error: - if (dfd != -1) close(dfd); connClose(conn); server.repl_transfer_s = NULL; if (server.repl_rdb_transfer_s) { @@ -3979,7 +3987,7 @@ void replicaofCommand(client *c) { if (!strcasecmp(c->argv[1]->ptr, "no") && !strcasecmp(c->argv[2]->ptr, "one")) { if (server.primary_host) { replicationUnsetPrimary(); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "PRIMARY MODE enabled (user request from '%s')", client); sdsfree(client); } @@ -4008,7 +4016,7 @@ void replicaofCommand(client *c) { /* There was no previous primary or the user 
specified a different one, * we can continue. */ replicationSetPrimary(c->argv[1]->ptr, port, 0); - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "REPLICAOF %s:%d enabled (user request from '%s')", server.primary_host, server.primary_port, client); sdsfree(client); @@ -4270,7 +4278,7 @@ void replicationResurrectProvisionalPrimary(void) { /* Create a primary client, but do not initialize the read handler yet, as this replica still has a local buffer to * drain. */ replicationCreatePrimaryClientWithHandler(server.repl_transfer_s, server.repl_provisional_primary.dbid, NULL); - memcpy(server.primary->replid, server.repl_provisional_primary.replid, CONFIG_RUN_ID_SIZE); + memcpy(server.primary->replid, server.repl_provisional_primary.replid, sizeof(server.repl_provisional_primary.replid)); server.primary->reploff = server.repl_provisional_primary.reploff; server.primary->read_reploff = server.repl_provisional_primary.read_reploff; server.primary_repl_offset = server.primary->reploff; diff --git a/src/resp_parser.c b/src/resp_parser.c index 950d2227b7..101e883d2f 100644 --- a/src/resp_parser.c +++ b/src/resp_parser.c @@ -58,6 +58,8 @@ #include "resp_parser.h" #include "server.h" +#include "valkey_strtod.h" + static int parseBulk(ReplyParser *parser, void *p_ctx) { const char *proto = parser->curr_location; char *p = strchr(proto + 1, '\r'); @@ -150,13 +152,11 @@ static int parseDouble(ReplyParser *parser, void *p_ctx) { parser->curr_location = p + 2; /* for \r\n */ char buf[MAX_LONG_DOUBLE_CHARS + 1]; size_t len = p - proto - 1; - double d; + double d = 0; if (len <= MAX_LONG_DOUBLE_CHARS) { memcpy(buf, proto + 1, len); buf[len] = '\0'; - d = strtod(buf, NULL); /* We expect a valid representation. */ - } else { - d = 0; + d = valkey_strtod(buf, NULL); /* We expect a valid representation. 
*/ } parser->callbacks.double_callback(p_ctx, d, proto, parser->curr_location - proto); return C_OK; diff --git a/src/sds.c b/src/sds.c index e14f4bd0bd..ee7a2c0f97 100644 --- a/src/sds.c +++ b/src/sds.c @@ -954,23 +954,30 @@ void sdsfreesplitres(sds *tokens, int count) { sds sdscatrepr(sds s, const char *p, size_t len) { s = sdsMakeRoomFor(s, len + 2); s = sdscatlen(s, "\"", 1); - while (len--) { - switch (*p) { - case '\\': - case '"': s = sdscatprintf(s, "\\%c", *p); break; - case '\n': s = sdscatlen(s, "\\n", 2); break; - case '\r': s = sdscatlen(s, "\\r", 2); break; - case '\t': s = sdscatlen(s, "\\t", 2); break; - case '\a': s = sdscatlen(s, "\\a", 2); break; - case '\b': s = sdscatlen(s, "\\b", 2); break; - default: - if (isprint(*p)) - s = sdscatlen(s, p, 1); - else + while (len) { + if (isprint(*p)) { + const char *start = p; + while (len && isprint(*p)) { + len--; + p++; + } + s = sdscatlen(s, start, p - start); + } else { + switch (*p) { + case '\\': + case '"': s = sdscatprintf(s, "\\%c", *p); break; + case '\n': s = sdscatlen(s, "\\n", 2); break; + case '\r': s = sdscatlen(s, "\\r", 2); break; + case '\t': s = sdscatlen(s, "\\t", 2); break; + case '\a': s = sdscatlen(s, "\\a", 2); break; + case '\b': s = sdscatlen(s, "\\b", 2); break; + default: s = sdscatprintf(s, "\\x%02x", (unsigned char)*p); - break; + break; + } + p++; + len--; } - p++; } return sdscatlen(s, "\"", 1); } @@ -1032,6 +1039,86 @@ int hex_digit_to_int(char c) { } } +/* Helper function for sdssplitargs that parses a single argument. It + * populates the number characters needed to store the parsed argument + * in len, if provided, or will copy the parsed string into dst, if provided. + * If the string is able to be parsed, this function returns the number of + * characters that were parsed. If the argument can't be parsed, it + * returns 0. 
*/ +static int sdsparsearg(const char *arg, unsigned int *len, char *dst) { + const char *p = arg; + int inq = 0; /* set to 1 if we are in "quotes" */ + int insq = 0; /* set to 1 if we are in 'single quotes' */ + int done = 0; + + while (!done) { + int new_char = -1; + if (inq) { + if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) { + new_char = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3)); + p += 3; + } else if (*p == '\\' && *(p + 1)) { + p++; + switch (*p) { + case 'n': new_char = '\n'; break; + case 'r': new_char = '\r'; break; + case 't': new_char = '\t'; break; + case 'b': new_char = '\b'; break; + case 'a': new_char = '\a'; break; + default: new_char = *p; break; + } + } else if (*p == '"') { + /* closing quote must be followed by a space or + * nothing at all. */ + if (*(p + 1) && !isspace(*(p + 1))) return 0; + done = 1; + } else if (!*p) { + /* unterminated quotes */ + return 0; + } else { + new_char = *p; + } + } else if (insq) { + if (*p == '\\' && *(p + 1) == '\'') { + p++; + new_char = *p; + } else if (*p == '\'') { + /* closing quote must be followed by a space or + * nothing at all. 
*/ + if (*(p + 1) && !isspace(*(p + 1))) return 0; + done = 1; + } else if (!*p) { + /* unterminated quotes */ + return 0; + } else { + new_char = *p; + } + } else { + switch (*p) { + case ' ': + case '\n': + case '\r': + case '\t': + case '\0': done = 1; break; + case '"': inq = 1; break; + case '\'': insq = 1; break; + default: new_char = *p; break; + } + } + if (new_char != -1) { + if (len) (*len)++; + if (dst) { + *dst = (char)new_char; + dst++; + } + } + if (*p) { + p++; + } + } + return p - arg; +} + /* Split a line into arguments, where every argument can be in the * following programming-language REPL-alike form: * @@ -1049,103 +1136,42 @@ int hex_digit_to_int(char c) { * The function returns the allocated tokens on success, even when the * input string is empty, or NULL if the input contains unbalanced * quotes or closed quotes followed by non space characters - * as in: "foo"bar or "foo' + * as in: "foo"bar or "foo'. + * + * The sds strings returned by this function are not initialized with + * extra space. 
*/ sds *sdssplitargs(const char *line, int *argc) { const char *p = line; - char *current = NULL; char **vector = NULL; *argc = 0; - while (1) { + while (*p) { /* skip blanks */ while (*p && isspace(*p)) p++; - if (*p) { - /* get a token */ - int inq = 0; /* set to 1 if we are in "quotes" */ - int insq = 0; /* set to 1 if we are in 'single quotes' */ - int done = 0; - - if (current == NULL) current = sdsempty(); - while (!done) { - if (inq) { - if (*p == '\\' && *(p + 1) == 'x' && is_hex_digit(*(p + 2)) && is_hex_digit(*(p + 3))) { - unsigned char byte; - - byte = (hex_digit_to_int(*(p + 2)) * 16) + hex_digit_to_int(*(p + 3)); - current = sdscatlen(current, (char *)&byte, 1); - p += 3; - } else if (*p == '\\' && *(p + 1)) { - char c; - - p++; - switch (*p) { - case 'n': c = '\n'; break; - case 'r': c = '\r'; break; - case 't': c = '\t'; break; - case 'b': c = '\b'; break; - case 'a': c = '\a'; break; - default: c = *p; break; - } - current = sdscatlen(current, &c, 1); - } else if (*p == '"') { - /* closing quote must be followed by a space or - * nothing at all. */ - if (*(p + 1) && !isspace(*(p + 1))) goto err; - done = 1; - } else if (!*p) { - /* unterminated quotes */ - goto err; - } else { - current = sdscatlen(current, p, 1); - } - } else if (insq) { - if (*p == '\\' && *(p + 1) == '\'') { - p++; - current = sdscatlen(current, "'", 1); - } else if (*p == '\'') { - /* closing quote must be followed by a space or - * nothing at all. 
*/ - if (*(p + 1) && !isspace(*(p + 1))) goto err; - done = 1; - } else if (!*p) { - /* unterminated quotes */ - goto err; - } else { - current = sdscatlen(current, p, 1); - } - } else { - switch (*p) { - case ' ': - case '\n': - case '\r': - case '\t': - case '\0': done = 1; break; - case '"': inq = 1; break; - case '\'': insq = 1; break; - default: current = sdscatlen(current, p, 1); break; - } - } - if (*p) p++; - } + if (!(*p)) break; + unsigned int len = 0; + if (sdsparsearg(p, &len, NULL)) { + sds current = sdsnewlen(SDS_NOINIT, len); + int parsedlen = sdsparsearg(p, NULL, current); + assert(parsedlen > 0); + p += parsedlen; + /* add the token to the vector */ vector = s_realloc(vector, ((*argc) + 1) * sizeof(char *)); vector[*argc] = current; (*argc)++; current = NULL; } else { - /* Even on empty input string return something not NULL. */ - if (vector == NULL) vector = s_malloc(sizeof(void *)); - return vector; + while ((*argc)--) sdsfree(vector[*argc]); + s_free(vector); + *argc = 0; + return NULL; } } - -err: - while ((*argc)--) sdsfree(vector[*argc]); - s_free(vector); - if (current) sdsfree(current); - *argc = 0; - return NULL; + /* Even on empty input string return something not NULL. */ + if (vector == NULL) vector = s_malloc(sizeof(void *)); + return vector; } /* Modify the string substituting all the occurrences of the set of diff --git a/src/sentinel.c b/src/sentinel.c index 711c4aea3e..ccd3ccbdca 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -416,8 +416,7 @@ void sentinelSimFailureCrash(void); void releaseSentinelValkeyInstance(sentinelValkeyInstance *ri); -void dictInstancesValDestructor(dict *d, void *obj) { - UNUSED(d); +void dictInstancesValDestructor(void *obj) { releaseSentinelValkeyInstance(obj); } @@ -4259,7 +4258,7 @@ void sentinelSetCommand(client *c) { /* If the target name is the same as the source name there * is no need to add an entry mapping to itself. 
*/ - if (!dictSdsKeyCaseCompare(ri->renamed_commands, oldname, newname)) { + if (!dictSdsKeyCaseCompare(oldname, newname)) { oldname = sdsdup(oldname); newname = sdsdup(newname); dictAdd(ri->renamed_commands, oldname, newname); diff --git a/src/server.c b/src/server.c index 25d9b78d6c..f978ebc210 100644 --- a/src/server.c +++ b/src/server.c @@ -360,25 +360,20 @@ void exitFromChild(int retcode) { * keys and Objects as values (Objects can hold SDS strings, * lists, sets). */ -void dictVanillaFree(dict *d, void *val) { - UNUSED(d); +void dictVanillaFree(void *val) { zfree(val); } -void dictListDestructor(dict *d, void *val) { - UNUSED(d); +void dictListDestructor(void *val) { listRelease((list *)val); } -void dictDictDestructor(dict *d, void *val) { - UNUSED(d); +void dictDictDestructor(void *val) { dictRelease((dict *)val); } -int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; @@ -391,30 +386,26 @@ size_t dictSdsEmbedKey(unsigned char *buf, size_t buf_len, const void *key, uint /* A case insensitive version used for the command lookup table and other * places where case insensitive non binary-safe comparison is needed. */ -int dictSdsKeyCaseCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictSdsKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } -void dictObjectDestructor(dict *d, void *val) { - UNUSED(d); +void dictObjectDestructor(void *val) { if (val == NULL) return; /* Lazy freeing will set value to NULL. 
*/ decrRefCount(val); } -void dictSdsDestructor(dict *d, void *val) { - UNUSED(d); +void dictSdsDestructor(void *val) { sdsfree(val); } -void *dictSdsDup(dict *d, const void *key) { - UNUSED(d); +void *dictSdsDup(const void *key) { return sdsdup((const sds)key); } -int dictObjKeyCompare(dict *d, const void *key1, const void *key2) { +int dictObjKeyCompare(const void *key1, const void *key2) { const robj *o1 = key1, *o2 = key2; - return dictSdsKeyCompare(d, o1->ptr, o2->ptr); + return dictSdsKeyCompare(o1->ptr, o2->ptr); } uint64_t dictObjHash(const void *key) { @@ -446,16 +437,13 @@ uint64_t dictClientHash(const void *key) { } /* Dict compare function for client */ -int dictClientKeyCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictClientKeyCompare(const void *key1, const void *key2) { return ((client *)key1)->id == ((client *)key2)->id; } /* Dict compare function for null terminated string */ -int dictCStrKeyCompare(dict *d, const void *key1, const void *key2) { +int dictCStrKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = strlen((char *)key1); l2 = strlen((char *)key2); if (l1 != l2) return 0; @@ -463,12 +451,11 @@ int dictCStrKeyCompare(dict *d, const void *key1, const void *key2) { } /* Dict case insensitive compare function for null terminated string */ -int dictCStrKeyCaseCompare(dict *d, const void *key1, const void *key2) { - UNUSED(d); +int dictCStrKeyCaseCompare(const void *key1, const void *key2) { return strcasecmp(key1, key2) == 0; } -int dictEncObjKeyCompare(dict *d, const void *key1, const void *key2) { +int dictEncObjKeyCompare(const void *key1, const void *key2) { robj *o1 = (robj *)key1, *o2 = (robj *)key2; int cmp; @@ -480,7 +467,7 @@ int dictEncObjKeyCompare(dict *d, const void *key1, const void *key2) { * objects as well. 
*/ if (o1->refcount != OBJ_STATIC_REFCOUNT) o1 = getDecodedObject(o1); if (o2->refcount != OBJ_STATIC_REFCOUNT) o2 = getDecodedObject(o2); - cmp = dictSdsKeyCompare(d, o1->ptr, o2->ptr); + cmp = dictSdsKeyCompare(o1->ptr, o2->ptr); if (o1->refcount != OBJ_STATIC_REFCOUNT) decrRefCount(o1); if (o2->refcount != OBJ_STATIC_REFCOUNT) decrRefCount(o2); return cmp; @@ -902,9 +889,10 @@ int clientsCronResizeOutputBuffer(client *c, mstime_t now_ms) { if (new_buffer_size) { oldbuf = c->buf; + size_t oldbuf_size = c->buf_usable_size; c->buf = zmalloc_usable(new_buffer_size, &c->buf_usable_size); memcpy(c->buf, oldbuf, c->bufpos); - zfree(oldbuf); + zfree_with_size(oldbuf, oldbuf_size); } return 0; } @@ -983,9 +971,10 @@ void updateClientMemoryUsage(client *c) { } int clientEvictionAllowed(client *c) { - if (server.maxmemory_clients == 0 || c->flag.no_evict || !c->conn) { + if (server.maxmemory_clients == 0 || c->flag.no_evict || c->flag.fake) { return 0; } + serverAssert(c->conn); int type = getClientType(c); return (type == CLIENT_TYPE_NORMAL || type == CLIENT_TYPE_PUBSUB); } @@ -1144,15 +1133,15 @@ void databasesCron(void) { /* Expire keys by random sampling. Not required for replicas * as primary will synthesize DELs for us. */ if (server.active_expire_enabled) { - if (iAmPrimary()) { - activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); - } else { + if (!iAmPrimary()) { expireReplicaKeys(); + } else if (!server.import_mode) { + activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW); } } - /* Defrag keys gradually. */ - activeDefragCycle(); + /* Start active defrag cycle or adjust defrag CPU if needed. */ + monitorActiveDefrag(); /* Perform hash tables rehashing if needed, but only if there are no * other processes saving the DB on disk. 
Otherwise rehashing is bad @@ -1310,8 +1299,8 @@ void cronUpdateMemoryStats(void) { * allocations, and allocator reserved pages that can be pursed (all not actual frag) */ zmalloc_get_allocator_info( &server.cron_malloc_stats.allocator_allocated, &server.cron_malloc_stats.allocator_active, - &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy, - &server.cron_malloc_stats.allocator_frag_smallbins_bytes); + &server.cron_malloc_stats.allocator_resident, NULL, &server.cron_malloc_stats.allocator_muzzy); + server.cron_malloc_stats.allocator_frag_smallbins_bytes = allocatorDefragGetFragSmallbins(); /* in case the allocator isn't providing these stats, fake them so that * fragmentation info still shows some (inaccurate metrics) */ if (!server.cron_malloc_stats.allocator_resident) { @@ -1622,24 +1611,7 @@ void whileBlockedCron(void) { mstime_t latency; latencyStartMonitor(latency); - /* In some cases we may be called with big intervals, so we may need to do - * extra work here. This is because some of the functions in serverCron rely - * on the fact that it is performed every 10 ms or so. For instance, if - * activeDefragCycle needs to utilize 25% cpu, it will utilize 2.5ms, so we - * need to call it multiple times. */ - long hz_ms = 1000 / server.hz; - while (server.blocked_last_cron < server.mstime) { - /* Defrag keys gradually. */ - activeDefragCycle(); - - server.blocked_last_cron += hz_ms; - - /* Increment cronloop so that run_with_period works. */ - server.cronloops++; - } - - /* Other cron jobs do not need to be done in a loop. No need to check - * server.blocked_last_cron since we have an early exit at the top. */ + defragWhileBlocked(); /* Update memory stats during loading (excluding blocked scripts) */ if (server.loading) cronUpdateMemoryStats(); @@ -1740,7 +1712,7 @@ void beforeSleep(struct aeEventLoop *eventLoop) { /* Run a fast expire cycle (the called function will return * ASAP if a fast cycle is not needed). 
*/ - if (server.active_expire_enabled && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); + if (server.active_expire_enabled && !server.import_mode && iAmPrimary()) activeExpireCycle(ACTIVE_EXPIRE_CYCLE_FAST); if (moduleCount()) { moduleFireServerEvent(VALKEYMODULE_EVENT_EVENTLOOP, VALKEYMODULE_SUBEVENT_EVENTLOOP_BEFORE_SLEEP, NULL); @@ -2131,7 +2103,7 @@ void initServerConfig(void) { server.aof_flush_postponed_start = 0; server.aof_last_incr_size = 0; server.aof_last_incr_fsync_offset = 0; - server.active_defrag_running = 0; + server.active_defrag_cpu_percent = 0; server.active_defrag_configuration_changed = 0; server.notify_keyspace_events = 0; server.blocked_clients = 0; @@ -2146,6 +2118,7 @@ void initServerConfig(void) { server.extended_redis_compat = 0; server.pause_cron = 0; server.dict_resizing = 1; + server.import_mode = 0; server.latency_tracking_info_percentiles_len = 3; server.latency_tracking_info_percentiles = zmalloc(sizeof(double) * (server.latency_tracking_info_percentiles_len)); @@ -2494,19 +2467,6 @@ void checkTcpBacklogSettings(void) { #endif } -void closeListener(connListener *sfd) { - int j; - - for (j = 0; j < sfd->count; j++) { - if (sfd->fd[j] == -1) continue; - - aeDeleteFileEvent(server.el, sfd->fd[j], AE_READABLE); - close(sfd->fd[j]); - } - - sfd->count = 0; -} - /* Create an event handler for accepting new connections in TCP or TLS domain sockets. * This works atomically for all socket fds */ int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler) { @@ -2570,7 +2530,7 @@ int listenToPort(connListener *sfd) { continue; /* Rollback successful listens before exiting */ - closeListener(sfd); + connCloseListener(sfd); return C_ERR; } if (server.socket_mark_id > 0) anetSetSockMarkId(NULL, sfd->fd[sfd->count], server.socket_mark_id); @@ -2671,7 +2631,6 @@ void initServer(void) { server.aof_state = server.aof_enabled ? AOF_ON : AOF_OFF; server.fsynced_reploff = server.aof_enabled ? 
0 : -1; server.hz = server.config_hz; - server.pid = getpid(); server.in_fork_child = CHILD_TYPE_NONE; server.rdb_pipe_read = -1; server.rdb_child_exit_pipe = -1; @@ -2707,6 +2666,7 @@ void initServer(void) { server.blocking_op_nesting = 0; server.thp_enabled = 0; server.cluster_drop_packet_filter = -1; + server.debug_cluster_disable_random_ping = 0; server.reply_buffer_peak_reset_time = REPLY_BUFFER_DEFAULT_PEAK_RESET_TIME; server.reply_buffer_resizing_enabled = 1; server.client_mem_usage_buckets = NULL; @@ -2746,8 +2706,6 @@ void initServer(void) { server.db[j].watched_keys = dictCreate(&keylistDictType); server.db[j].id = j; server.db[j].avg_ttl = 0; - server.db[j].defrag_later = listCreate(); - listSetFreeMethod(server.db[j].defrag_later, (void (*)(void *))sdsfree); } evictionPoolAlloc(); /* Initialize the LRU keys pool. */ /* Note that server.pubsub_channels was chosen to be a kvstore (with only one dict, which @@ -2911,6 +2869,17 @@ void initListeners(void) { listener->priv = &server.unix_ctx_config; /* Unix socket specified */ } + if (server.rdma_ctx_config.port != 0) { + conn_index = connectionIndexByType(CONN_TYPE_RDMA); + if (conn_index < 0) serverPanic("Failed finding connection listener of %s", CONN_TYPE_RDMA); + listener = &server.listeners[conn_index]; + listener->bindaddr = server.rdma_ctx_config.bindaddr; + listener->bindaddr_count = server.rdma_ctx_config.bindaddr_count; + listener->port = server.rdma_ctx_config.port; + listener->ct = connectionByType(CONN_TYPE_RDMA); + listener->priv = &server.rdma_ctx_config; + } + /* create all the configured listener, and add handler to start to accept */ int listen_fds = 0; for (int j = 0; j < CONN_TYPE_MAX; j++) { @@ -3330,8 +3299,28 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) { if (!shouldPropagate(target)) return; /* This needs to be unreachable since the dataset should be fixed during - * replica pause (otherwise data may be lost during a failover) */ - 
serverAssert(!(isPausedActions(PAUSE_ACTION_REPLICA) && (!server.client_pause_in_transaction))); + * replica pause (otherwise data may be lost during a failover). + * + * Though, there are exceptions: + * + * 1. We allow write commands that were queued up before and after to + * execute, if a CLIENT PAUSE executed during a transaction, we will + * track the state, the CLIENT PAUSE takes effect only after a transaction + * has finished. + * 2. Primary loses a slot during the pause, deletes all keys and replicates + * DEL to its replicas. In this case, we will track the state, the dirty + * slots will be deleted in the end without affecting the data consistency. + * + * Note that case 2 can happen in one of the following scenarios: + * 1) The primary waits for the replica to replicate before exiting, see + * shutdown-timeout in conf for more details. In this case, primary lost + * a slot during the SIGTERM waiting. + * 2) The primary waits for the replica to replicate during a manual failover. + * In this case, primary lost a slot during the pausing. + * 3) The primary was paused by CLIENT PAUSE, and lost a slot during the + * pausing. */ + serverAssert(!isPausedActions(PAUSE_ACTION_REPLICA) || server.client_pause_in_transaction || + server.server_del_keys_in_slot); if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF) feedAppendOnlyFile(dbid, argv, argc); if (target & PROPAGATE_REPL) replicationFeedReplicas(dbid, argv, argc); @@ -3671,10 +3660,6 @@ void call(client *c, int flags) { replicationFeedMonitors(c, server.monitors, c->db->id, argv, argc); } - /* Clear the original argv. - * If the client is blocked we will handle slowlog when it is unblocked. */ - if (!c->flag.blocked) freeClientOriginalArgv(c); - /* Populate the per-command and per-slot statistics that we show in INFO commandstats and CLUSTER SLOT-STATS, * respectively. If the client is blocked we will handle latency stats and duration when it is unblocked. 
*/ if (update_command_stats && !c->flag.blocked) { @@ -4166,12 +4151,6 @@ int processCommand(client *c) { return C_OK; } - /* Not allow several UNSUBSCRIBE commands executed under non-pubsub mode */ - if (!c->flag.pubsub && (c->cmd->proc == unsubscribeCommand || c->cmd->proc == sunsubscribeCommand || - c->cmd->proc == punsubscribeCommand)) { - rejectCommandFormat(c, "-NOSUB '%s' command executed not in subscribed mode", c->cmd->fullname); - return C_OK; - } /* Only allow commands with flag "t", such as INFO, REPLICAOF and so on, * when replica-serve-stale-data is no and we are a replica with a broken * link with primary. */ @@ -4324,7 +4303,7 @@ int prepareForShutdown(client *c, int flags) { server.shutdown_flags = flags; if (c != NULL) { - sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log); + sds client = catClientInfoShortString(sdsempty(), c, server.hide_user_data_from_log); serverLog(LL_NOTICE, "User requested shutdown... (user request from '%s')", client); sdsfree(client); } else { @@ -5703,7 +5682,7 @@ sds genValkeyInfoString(dict *section_dict, int all_sections, int everything) { "mem_aof_buffer:%zu\r\n", mh->aof_buffer, "mem_allocator:%s\r\n", ZMALLOC_LIB, "mem_overhead_db_hashtable_rehashing:%zu\r\n", mh->overhead_db_hashtable_rehashing, - "active_defrag_running:%d\r\n", server.active_defrag_running, + "active_defrag_running:%d\r\n", server.active_defrag_cpu_percent, "lazyfree_pending_objects:%zu\r\n", lazyfreeGetPendingObjectsCount(), "lazyfreed_objects:%zu\r\n", lazyfreeGetFreedObjectsCount())); freeMemoryOverheadData(mh); @@ -6295,7 +6274,7 @@ connListener *listenerByType(const char *typename) { /* Close original listener, re-create a new listener from the updated bind address & port */ int changeListener(connListener *listener) { /* Close old servers */ - closeListener(listener); + connCloseListener(listener); /* Just close the server if port disabled */ if (listener->port == 0) { @@ -6397,7 +6376,7 @@ void 
closeChildUnusedResourceAfterFork(void) { int serverFork(int purpose) { if (isMutuallyExclusiveChildType(purpose)) { if (hasActiveChildProcess()) { - errno = EEXIST; + errno = EALREADY; return -1; } @@ -6782,91 +6761,22 @@ int iAmPrimary(void) { (server.cluster_enabled && clusterNodeIsPrimary(getMyClusterNode()))); } -#ifdef SERVER_TEST -#include "testhelp.h" -#include "intset.h" /* Compact integer set structure */ - -int __failed_tests = 0; -int __test_num = 0; - -/* The flags are the following: - * --accurate: Runs tests with more iterations. - * --large-memory: Enables tests that consume more than 100mb. */ -typedef int serverTestProc(int argc, char **argv, int flags); -struct serverTest { - char *name; - serverTestProc *proc; - int failed; -} serverTests[] = { - {"quicklist", quicklistTest}, -}; -serverTestProc *getTestProcByName(const char *name) { - int numtests = sizeof(serverTests) / sizeof(struct serverTest); - for (int j = 0; j < numtests; j++) { - if (!strcasecmp(name, serverTests[j].name)) { - return serverTests[j].proc; - } - } - return NULL; -} -#endif - -int main(int argc, char **argv) { +/* Main is marked as weak so that unit tests can use their own main function. */ +__attribute__((weak)) int main(int argc, char **argv) { struct timeval tv; int j; char config_from_stdin = 0; -#ifdef SERVER_TEST - monotonicInit(); /* Required for dict tests, that are relying on monotime during dict rehashing. 
*/ - if (argc >= 3 && !strcasecmp(argv[1], "test")) { - int flags = 0; - for (j = 3; j < argc; j++) { - char *arg = argv[j]; - if (!strcasecmp(arg, "--accurate")) - flags |= TEST_ACCURATE; - else if (!strcasecmp(arg, "--large-memory")) - flags |= TEST_LARGE_MEMORY; - else if (!strcasecmp(arg, "--valgrind")) - flags |= TEST_VALGRIND; - } - - if (!strcasecmp(argv[2], "all")) { - int numtests = sizeof(serverTests) / sizeof(struct serverTest); - for (j = 0; j < numtests; j++) { - serverTests[j].failed = (serverTests[j].proc(argc, argv, flags) != 0); - } - - /* Report tests result */ - int failed_num = 0; - for (j = 0; j < numtests; j++) { - if (serverTests[j].failed) { - failed_num++; - printf("[failed] Test - %s\n", serverTests[j].name); - } else { - printf("[ok] Test - %s\n", serverTests[j].name); - } - } - - printf("%d tests, %d passed, %d failed\n", numtests, numtests - failed_num, failed_num); - - return failed_num == 0 ? 0 : 1; - } else { - serverTestProc *proc = getTestProcByName(argv[2]); - if (!proc) return -1; /* test not found */ - return proc(argc, argv, flags); - } - - return 0; - } -#endif - /* We need to initialize our libraries, and the server configuration. */ #ifdef INIT_SETPROCTITLE_REPLACEMENT spt_init(argc, argv); #endif tzset(); /* Populates 'timezone' global. */ zmalloc_set_oom_handler(serverOutOfMemoryHandler); - +#if defined(HAVE_DEFRAG) + int res = allocatorDefragInit(); + serverAssert(res == 0); +#endif /* To achieve entropy, in case of containers, their time() and getpid() can * be the same. But value of tv_usec is fast enough to make the difference */ gettimeofday(&tv, NULL); @@ -6889,6 +6799,7 @@ int main(int argc, char **argv) { if (exec_name == NULL) exec_name = argv[0]; server.sentinel_mode = checkForSentinelMode(argc, argv, exec_name); initServerConfig(); + server.pid = getpid(); ACLInit(); /* The ACL subsystem must be initialized ASAP because the basic networking code and client creation depends on it. 
*/ moduleInitModulesSystem(); @@ -7070,7 +6981,12 @@ int main(int argc, char **argv) { /* Daemonize if needed */ server.supervised = serverIsSupervised(server.supervised_mode); int background = server.daemonize && !server.supervised; - if (background) daemonize(); + if (background) { + /* We need to reset server.pid after daemonize(), otherwise the + * log printing role will always be the child. */ + daemonize(); + server.pid = getpid(); + } serverLog(LL_NOTICE, "oO0OoO0OoO0Oo Valkey is starting oO0OoO0OoO0Oo"); serverLog(LL_NOTICE, "Valkey version=%s, bits=%d, commit=%s, modified=%d, pid=%d, just started", VALKEY_VERSION, @@ -7154,5 +7070,4 @@ int main(int argc, char **argv) { aeDeleteEventLoop(server.el); return 0; } - /* The End */ diff --git a/src/server.h b/src/server.h index 2803a71af7..c5e686f7fd 100644 --- a/src/server.h +++ b/src/server.h @@ -35,6 +35,7 @@ #include "solarisfixes.h" #include "rio.h" #include "commands.h" +#include "allocator_defrag.h" #include #include @@ -960,7 +961,6 @@ typedef struct serverDb { int id; /* Database ID */ long long avg_ttl; /* Average TTL, just for stats */ unsigned long expires_cursor; /* Cursor of the active expire cycle. */ - list *defrag_later; /* List of key names to attempt to defrag one by one, gradually. */ } serverDb; /* forward declaration for functions ctx */ @@ -1093,9 +1093,10 @@ typedef struct { /* With multiplexing we need to take per-client state. * Clients are taken in a linked list. */ -#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ - need more reserved IDs use UINT64_MAX-1, \ - -2, ... and so forth. */ +#define CLIENT_ID_AOF (UINT64_MAX) /* Reserved ID for the AOF client. If you \ + need more reserved IDs use UINT64_MAX-1, \ + -2, ... and so forth. */ +#define CLIENT_ID_CACHED_RESPONSE (UINT64_MAX - 1) /* Client for cached response, see createCachedResponseClient. 
*/ /* Replication backlog is not a separate memory, it just is one consumer of * the global replication buffer. This structure records the reference of @@ -1233,7 +1234,8 @@ typedef struct ClientFlags { * knows that it does not need the cache and required a full sync. With this * flag, we won't cache the primary in freeClient. */ uint64_t fake : 1; /* This is a fake client without a real connection. */ - uint64_t reserved : 5; /* Reserved for future use */ + uint64_t import_source : 1; /* This client is importing data to server and can visit expired key. */ + uint64_t reserved : 4; /* Reserved for future use */ } ClientFlags; typedef struct client { @@ -1447,6 +1449,10 @@ typedef struct zskiplistNode { struct zskiplistNode *backward; struct zskiplistLevel { struct zskiplistNode *forward; + /* At each level we keep the span, which is the number of elements which are on the "subtree" + * from this node at this level to the next node at the same level. + * One exception is the value at level 0. In level 0 the span can only be 1 or 0 (in case the last elements in the list) + * So we use it in order to hold the height of the node, which is the number of levels. 
*/ unsigned long span; } level[]; } zskiplistNode; @@ -1611,6 +1617,17 @@ typedef struct serverUnixContextConfig { unsigned int perm; /* UNIX socket permission (see mode_t) */ } serverUnixContextConfig; +/*----------------------------------------------------------------------------- + * RDMA Context Configuration + *----------------------------------------------------------------------------*/ +typedef struct serverRdmaContextConfig { + char *bindaddr[CONFIG_BINDADDR_MAX]; + int bindaddr_count; + int port; + int rx_size; + int completion_vector; +} serverRdmaContextConfig; + /*----------------------------------------------------------------------------- * AOF manifest definition *----------------------------------------------------------------------------*/ @@ -1688,7 +1705,7 @@ struct valkeyServer { int last_sig_received; /* Indicates the last SIGNAL received, if any (e.g., SIGINT or SIGTERM). */ int shutdown_flags; /* Flags passed to prepareForShutdown(). */ int activerehashing; /* Incremental rehash in serverCron() */ - int active_defrag_running; /* Active defragmentation running (holds current scan aggressiveness) */ + int active_defrag_cpu_percent; /* Current desired CPU percentage for active defrag */ char *pidfile; /* PID file path */ int arch_bits; /* 32 or 64 depending on sizeof(long) */ int cronloops; /* Number of times the cron function run */ @@ -1701,6 +1718,7 @@ struct valkeyServer { const char *busy_module_yield_reply; /* When non-null, we are inside RM_Yield. */ char *ignore_warnings; /* Config: warnings that should be ignored. */ int client_pause_in_transaction; /* Was a client pause executed during this Exec? */ + int server_del_keys_in_slot; /* The server is deleting the keys in the dirty slot. */ int thp_enabled; /* If true, THP is enabled. */ size_t page_size; /* The page size of OS. 
*/ /* Modules */ @@ -1884,8 +1902,9 @@ struct valkeyServer { size_t active_defrag_ignore_bytes; /* minimum amount of fragmentation waste to start active defrag */ int active_defrag_threshold_lower; /* minimum percentage of fragmentation to start active defrag */ int active_defrag_threshold_upper; /* maximum percentage of fragmentation at which we use maximum effort */ - int active_defrag_cycle_min; /* minimal effort for defrag in CPU percentage */ - int active_defrag_cycle_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cpu_min; /* minimal effort for defrag in CPU percentage */ + int active_defrag_cpu_max; /* maximal effort for defrag in CPU percentage */ + int active_defrag_cycle_us; /* standard duration of defrag cycle */ unsigned long active_defrag_max_scan_fields; /* maximum number of fields of set/hash/zset/list to process from within the main dict scan */ size_t client_max_querybuf_len; /* Limit for client query buffer length */ @@ -2089,6 +2108,8 @@ struct valkeyServer { char primary_replid[CONFIG_RUN_ID_SIZE + 1]; /* Primary PSYNC runid. */ long long primary_initial_offset; /* Primary PSYNC offset. */ int repl_replica_lazy_flush; /* Lazy FLUSHALL before loading DB? */ + /* Import Mode */ + int import_mode; /* If true, server is in import mode and forbid expiration and eviction. */ /* Synchronous replication. */ list *clients_waiting_acks; /* Clients waiting in WAIT or WAITAOF. */ int get_ack_from_replicas; /* If true we send REPLCONF GETACK. */ @@ -2190,6 +2211,8 @@ struct valkeyServer { int cluster_slot_stats_enabled; /* Cluster slot usage statistics tracking enabled. */ /* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */ uint32_t debug_cluster_close_link_on_packet_drop : 1; + /* Debug config to control the random ping. When set, we will disable the random ping in clusterCron. 
*/ + uint32_t debug_cluster_disable_random_ping : 1; sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */ /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ @@ -2222,6 +2245,7 @@ struct valkeyServer { int tls_auth_clients; serverTLSContextConfig tls_ctx_config; serverUnixContextConfig unix_ctx_config; + serverRdmaContextConfig rdma_ctx_config; /* cpu affinity */ char *server_cpulist; /* cpu affinity list of server main/io thread. */ char *bio_cpulist; /* cpu affinity list of bio thread. */ @@ -2713,7 +2737,7 @@ size_t moduleGetFreeEffort(robj *key, robj *val, int dbid); size_t moduleGetMemUsage(robj *key, robj *val, size_t sample_size, int dbid); robj *moduleTypeDupOrReply(client *c, robj *fromkey, robj *tokey, int todb, robj *value); int moduleDefragValue(robj *key, robj *obj, int dbid); -int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, long long endtime, int dbid); +int moduleLateDefrag(robj *key, robj *value, unsigned long *cursor, monotime endtime, int dbid); void moduleDefragGlobals(void); void *moduleGetHandleByName(char *modulename); int moduleIsModuleCommand(void *module_handle, struct serverCommand *cmd); @@ -2731,7 +2755,7 @@ int serverSetProcTitle(char *title); int validateProcTitleTemplate(const char *template); int serverCommunicateSystemd(const char *sd_notify_msg); void serverSetCpuAffinity(const char *cpulist); -void dictVanillaFree(dict *d, void *val); +void dictVanillaFree(void *val); /* ERROR STATS constants */ @@ -2847,6 +2871,7 @@ char *getClientPeerId(client *client); char *getClientSockName(client *client); int isClientConnIpV6(client *c); sds catClientInfoString(sds s, client *client, int hide_user_data); +sds catClientInfoShortString(sds s, client *client, int hide_user_data); sds getAllClientsInfoString(int type, int hide_user_data); int clientSetName(client *c, robj *name, const char **err); void 
rewriteClientCommandVector(client *c, int argc, ...); @@ -2864,7 +2889,7 @@ void flushReplicasOutputBuffers(void); void disconnectReplicas(void); void evictClients(void); int listenToPort(connListener *fds); -void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions_bitmask); +void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions); void unpauseActions(pause_purpose purpose); uint32_t isPausedActions(uint32_t action_bitmask); uint32_t isPausedActionsWithUpdate(uint32_t action_bitmask); @@ -3285,7 +3310,6 @@ void setupSignalHandlers(void); int createSocketAcceptHandler(connListener *sfd, aeFileProc *accept_handler); connListener *listenerByType(const char *typename); int changeListener(connListener *listener); -void closeListener(connListener *listener); struct serverCommand *lookupSubcommand(struct serverCommand *container, sds sub_name); struct serverCommand *lookupCommand(robj **argv, int argc); struct serverCommand *lookupCommandBySdsLogic(dict *commands, sds s); @@ -3334,7 +3358,8 @@ void bytesToHuman(char *s, size_t size, unsigned long long n); void enterExecutionUnit(int update_cached_time, long long us); void exitExecutionUnit(void); void resetServerStats(void); -void activeDefragCycle(void); +void monitorActiveDefrag(void); +void defragWhileBlocked(void); unsigned int getLRUClock(void); unsigned int LRU_CLOCK(void); const char *evictPolicyToString(void); @@ -3565,7 +3590,7 @@ long long emptyDbStructure(serverDb *dbarray, int dbnum, int async, void(callbac void flushAllDataAndResetRDB(int flags); long long dbTotalServerKeyCount(void); serverDb *initTempDb(void); -void discardTempDb(serverDb *tempDb, void(callback)(dict *)); +void discardTempDb(serverDb *tempDb); int selectDb(client *c, int id); @@ -3718,11 +3743,11 @@ void startEvictionTimeProc(void); /* Keys hashing / comparison functions for dict.c hash tables. 
*/ uint64_t dictSdsHash(const void *key); uint64_t dictSdsCaseHash(const void *key); -int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); -int dictSdsKeyCaseCompare(dict *d, const void *key1, const void *key2); -void dictSdsDestructor(dict *d, void *val); -void dictListDestructor(dict *d, void *val); -void *dictSdsDup(dict *d, const void *key); +int dictSdsKeyCompare(const void *key1, const void *key2); +int dictSdsKeyCaseCompare(const void *key1, const void *key2); +void dictSdsDestructor(void *val); +void dictListDestructor(void *val); +void *dictSdsDup(const void *key); /* Git SHA1 */ char *serverGitSHA1(void); @@ -4037,6 +4062,11 @@ void debugPauseProcess(void); _serverLog(level, __VA_ARGS__); \ } while (0) +/* dualChannelServerLog - Log messages related to dual-channel operations + * This macro wraps the serverLog function, prepending "<Dual Channel>" + * to the log message. */ +#define dualChannelServerLog(level, ...) serverLog(level, "<Dual Channel> " __VA_ARGS__) + #define serverDebug(fmt, ...) 
printf("DEBUG %s:%d > " fmt "\n", __FILE__, __LINE__, __VA_ARGS__) #define serverDebugMark() printf("-- MARK %s:%d --\n", __FILE__, __LINE__) diff --git a/src/socket.c b/src/socket.c index 7344d66ad8..d89e6c8767 100644 --- a/src/socket.c +++ b/src/socket.c @@ -339,6 +339,19 @@ static int connSocketListen(connListener *listener) { return listenToPort(listener); } +static void connSocketCloseListener(connListener *listener) { + int j; + + for (j = 0; j < listener->count; j++) { + if (listener->fd[j] == -1) continue; + + aeDeleteFileEvent(server.el, listener->fd[j], AE_READABLE); + close(listener->fd[j]); + } + + listener->count = 0; +} + static int connSocketBlockingConnect(connection *conn, const char *addr, int port, long long timeout) { int fd = anetTcpNonBlockConnect(NULL, addr, port); if (fd == -1) { @@ -395,6 +408,7 @@ static ConnectionType CT_Socket = { .addr = connSocketAddr, .is_local = connSocketIsLocal, .listen = connSocketListen, + .closeListener = connSocketCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateSocket, diff --git a/src/sort.c b/src/sort.c index 92777b068c..ad0496da79 100644 --- a/src/sort.c +++ b/src/sort.c @@ -34,6 +34,8 @@ #include <math.h> /* isnan() */ #include "cluster.h" +#include "valkey_strtod.h" + zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); serverSortOperation *createSortOperation(int type, robj *pattern) { @@ -479,9 +481,9 @@ void sortCommandGeneric(client *c, int readonly) { } else { if (sdsEncodedObject(byval)) { char *eptr; - - vector[j].u.score = strtod(byval->ptr, &eptr); - if (eptr[0] != '\0' || errno == ERANGE || isnan(vector[j].u.score)) { + errno = 0; + vector[j].u.score = valkey_strtod(byval->ptr, &eptr); + if (eptr[0] != '\0' || errno == ERANGE || errno == EINVAL || isnan(vector[j].u.score)) { int_conversion_error = 1; } } else if (byval->encoding == OBJ_ENCODING_INT) { diff --git a/src/t_zset.c b/src/t_zset.c index 069ab0924a..36a9bfffb1 100644 --- a/src/t_zset.c +++ 
b/src/t_zset.c @@ -60,6 +60,8 @@ #include "intset.h" /* Compact integer set structure */ #include <math.h> +#include "valkey_strtod.h" + /*----------------------------------------------------------------------------- * Skiplist implementation of the low level API *----------------------------------------------------------------------------*/ @@ -70,12 +72,51 @@ void zsetConvertAndExpand(robj *zobj, int encoding, unsigned long cap); zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank); zskiplistNode *zslGetElementByRank(zskiplist *zsl, unsigned long rank); +static inline unsigned long zslGetNodeSpanAtLevel(zskiplistNode *x, int level) { + /* We use the level 0 span in order to hold the node height, so in case the span is requested on + * level 0 and this is not the last node we return 1 and 0 otherwise. For the rest of the levels we just return + * the recorded span in that level. */ + if (level > 0) return x->level[level].span; + return x->level[level].forward ? 1 : 0; +} + +static inline void zslSetNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long span) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span = span; +} + +static inline void zslIncrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long incr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span += incr; +} + +static inline void zslDecrNodeSpanAtLevel(zskiplistNode *x, int level, unsigned long decr) { + /* We use the level 0 span in order to hold the node height, so we avoid overriding it. */ + if (level > 0) + x->level[level].span -= decr; +} + +static inline unsigned long zslGetNodeHeight(zskiplistNode *x) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. 
*/ + return x->level[0].span; +} + +static inline void zslSetNodeHeight(zskiplistNode *x, int height) { + /* Since the span at level 0 is always 1 (or 0 for the last node), this + * field is instead used for storing the height of the node. */ + x->level[0].span = height; +} + /* Create a skiplist node with the specified number of levels. * The SDS string 'ele' is referenced by the node after the call. */ -zskiplistNode *zslCreateNode(int level, double score, sds ele) { - zskiplistNode *zn = zmalloc(sizeof(*zn) + level * sizeof(struct zskiplistLevel)); +zskiplistNode *zslCreateNode(int height, double score, sds ele) { + zskiplistNode *zn = zmalloc(sizeof(*zn) + height * sizeof(struct zskiplistLevel)); zn->score = score; zn->ele = ele; + zslSetNodeHeight(zn, height); return zn; } @@ -145,7 +186,7 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) < 0))) { - rank[i] += x->level[i].span; + rank[i] += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -159,9 +200,10 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { for (i = zsl->level; i < level; i++) { rank[i] = 0; update[i] = zsl->header; - update[i]->level[i].span = zsl->length; + zslSetNodeSpanAtLevel(update[i], i, zsl->length); } zsl->level = level; + zslSetNodeHeight(zsl->header, level); } x = zslCreateNode(level, score, ele); for (i = 0; i < level; i++) { @@ -169,13 +211,13 @@ zskiplistNode *zslInsert(zskiplist *zsl, double score, sds ele) { update[i]->level[i].forward = x; /* update span covered by update[i] as x is inserted here */ - x->level[i].span = update[i]->level[i].span - (rank[0] - rank[i]); - update[i]->level[i].span = (rank[0] - rank[i]) + 1; + zslSetNodeSpanAtLevel(x, i, zslGetNodeSpanAtLevel(update[i], i) - (rank[0] - rank[i])); + zslSetNodeSpanAtLevel(update[i], i, (rank[0] - rank[i]) + 1); } /* 
increment span for untouched levels */ for (i = level; i < zsl->level; i++) { - update[i]->level[i].span++; + zslIncrNodeSpanAtLevel(update[i], i, 1); } x->backward = (update[0] == zsl->header) ? NULL : update[0]; @@ -193,10 +235,10 @@ void zslDeleteNode(zskiplist *zsl, zskiplistNode *x, zskiplistNode **update) { int i; for (i = 0; i < zsl->level; i++) { if (update[i]->level[i].forward == x) { - update[i]->level[i].span += x->level[i].span - 1; + zslIncrNodeSpanAtLevel(update[i], i, zslGetNodeSpanAtLevel(x, i) - 1); update[i]->level[i].forward = x->level[i].forward; } else { - update[i]->level[i].span -= 1; + zslDecrNodeSpanAtLevel(update[i], i, 1); } } if (x->level[0].forward) { @@ -334,7 +376,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. */ @@ -346,7 +388,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslValueGteMin(x->level[i].forward->score, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -370,7 +412,7 @@ zskiplistNode *zslNthInRange(zskiplist *zsl, zrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslValueLteMax(x->level[i].forward->score, range)) { /* Count the rank of the last element in range. 
*/ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -462,8 +504,8 @@ unsigned long zslDeleteRangeByRank(zskiplist *zsl, unsigned int start, unsigned x = zsl->header; for (i = zsl->level - 1; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) < start) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) < start) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } update[i] = x; @@ -497,7 +539,7 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { while (x->level[i].forward && (x->level[i].forward->score < score || (x->level[i].forward->score == score && sdscmp(x->level[i].forward->ele, ele) <= 0))) { - rank += x->level[i].span; + rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } @@ -509,6 +551,18 @@ unsigned long zslGetRank(zskiplist *zsl, double score, sds ele) { return 0; } +/* Find the rank for a specific skiplist node. */ +unsigned long zslGetRankByNode(zskiplist *zsl, zskiplistNode *x) { + int i = zslGetNodeHeight(x) - 1; + unsigned long rank = zslGetNodeSpanAtLevel(x, i); + while (x->level[zslGetNodeHeight(x) - 1].forward) { + x = x->level[zslGetNodeHeight(x) - 1].forward; + rank += zslGetNodeSpanAtLevel(x, zslGetNodeHeight(x) - 1); + } + rank = zsl->length - rank; + return rank; +} + /* Finds an element by its rank from start node. The rank argument needs to be 1-based. 
*/ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_level, unsigned long rank) { zskiplistNode *x; @@ -517,8 +571,8 @@ zskiplistNode *zslGetElementByRankFromNode(zskiplistNode *start_node, int start_ x = start_node; for (i = start_level; i >= 0; i--) { - while (x->level[i].forward && (traversed + x->level[i].span) <= rank) { - traversed += x->level[i].span; + while (x->level[i].forward && (traversed + zslGetNodeSpanAtLevel(x, i)) <= rank) { + traversed += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } if (traversed == rank) { @@ -546,11 +600,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->min = (long)min->ptr; } else { if (((char *)min->ptr)[0] == '(') { - spec->min = strtod((char *)min->ptr + 1, &eptr); + spec->min = valkey_strtod((char *)min->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; spec->minex = 1; } else { - spec->min = strtod((char *)min->ptr, &eptr); + spec->min = valkey_strtod((char *)min->ptr, &eptr); if (eptr[0] != '\0' || isnan(spec->min)) return C_ERR; } } @@ -558,11 +612,11 @@ static int zslParseRange(robj *min, robj *max, zrangespec *spec) { spec->max = (long)max->ptr; } else { if (((char *)max->ptr)[0] == '(') { - spec->max = strtod((char *)max->ptr + 1, &eptr); + spec->max = valkey_strtod((char *)max->ptr + 1, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; spec->maxex = 1; } else { - spec->max = strtod((char *)max->ptr, &eptr); + spec->max = valkey_strtod((char *)max->ptr, &eptr); if (eptr[0] != '\0' || isnan(spec->max)) return C_ERR; } } @@ -688,7 +742,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { x = zsl->header; i = zsl->level - 1; while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } /* Remember the last node which has zsl->level-1 levels and its rank. 
*/ @@ -700,7 +754,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *OUT* of range. */ while (x->level[i].forward && !zslLexValueGteMin(x->level[i].forward->ele, range)) { /* Count the rank of the last element smaller than the range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -724,7 +778,7 @@ zskiplistNode *zslNthInLexRange(zskiplist *zsl, zlexrangespec *range, long n) { /* Go forward while *IN* range. */ while (x->level[i].forward && zslLexValueLteMax(x->level[i].forward->ele, range)) { /* Count the rank of the last element in range. */ - edge_rank += x->level[i].span; + edge_rank += zslGetNodeSpanAtLevel(x, i); x = x->level[i].forward; } } @@ -757,7 +811,7 @@ double zzlStrtod(unsigned char *vstr, unsigned int vlen) { if (vlen > sizeof(buf) - 1) vlen = sizeof(buf) - 1; memcpy(buf, vstr, vlen); buf[vlen] = '\0'; - return strtod(buf, NULL); + return valkey_strtod(buf, NULL); } double zzlGetScore(unsigned char *sptr) { @@ -1171,6 +1225,13 @@ unsigned char *zzlDeleteRangeByRank(unsigned char *zl, unsigned int start, unsig * Common sorted set API *----------------------------------------------------------------------------*/ +/* Utility function used for mapping the hashtable entry to the matching skiplist node. + * For example, this is used in case of ZRANK query. 
*/ +static inline zskiplistNode *zsetGetSLNodeByEntry(dictEntry *de) { + char *score_ref = ((char *)dictGetVal(de)); + return (zskiplistNode *)(score_ref - offsetof(zskiplistNode, score)); +} + unsigned long zsetLength(const robj *zobj) { unsigned long length = 0; if (zobj->encoding == OBJ_ENCODING_LISTPACK) { @@ -1601,8 +1662,9 @@ long zsetRank(robj *zobj, sds ele, int reverse, double *output_score) { de = dictFind(zs->dict, ele); if (de != NULL) { - score = *(double *)dictGetVal(de); - rank = zslGetRank(zsl, score, ele); + zskiplistNode *n = zsetGetSLNodeByEntry(de); + score = n->score; + rank = zslGetRankByNode(zsl, n); /* Existing elements always have a rank. */ serverAssert(rank != 0); if (output_score) *output_score = score; diff --git a/src/tls.c b/src/tls.c index f1c82d35e4..48b75553de 100644 --- a/src/tls.c +++ b/src/tls.c @@ -446,6 +446,7 @@ typedef enum { #define TLS_CONN_FLAG_WRITE_WANT_READ (1 << 1) #define TLS_CONN_FLAG_FD_SET (1 << 2) #define TLS_CONN_FLAG_POSTPONE_UPDATE_STATE (1 << 3) +#define TLS_CONN_FLAG_HAS_PENDING (1 << 4) typedef struct tls_connection { connection c; @@ -614,7 +615,7 @@ static void updatePendingData(tls_connection *conn) { /* If SSL has pending data, already read from the socket, we're at risk of not calling the read handler again, make * sure to add it to a list of pending connection that should be handled anyway. 
*/ - if (SSL_pending(conn->ssl) > 0) { + if (conn->flags & TLS_CONN_FLAG_HAS_PENDING) { if (!conn->pending_list_node) { listAddNodeTail(pending_list, conn); conn->pending_list_node = listLast(pending_list); @@ -625,6 +626,14 @@ static void updatePendingData(tls_connection *conn) { } } +void updateSSLPendingFlag(tls_connection *conn) { + if (SSL_pending(conn->ssl) > 0) { + conn->flags |= TLS_CONN_FLAG_HAS_PENDING; + } else { + conn->flags &= ~TLS_CONN_FLAG_HAS_PENDING; + } +} + static void updateSSLEvent(tls_connection *conn) { if (conn->flags & TLS_CONN_FLAG_POSTPONE_UPDATE_STATE) return; @@ -653,8 +662,6 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { TLSCONN_DEBUG("tlsEventHandler(): fd=%d, state=%d, mask=%d, r=%d, w=%d, flags=%d", fd, conn->c.state, mask, conn->c.read_handler != NULL, conn->c.write_handler != NULL, conn->flags); - ERR_clear_error(); - switch (conn->c.state) { case CONN_STATE_CONNECTING: conn_error = anetGetError(conn->c.fd); @@ -662,6 +669,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.last_errno = conn_error; conn->c.state = CONN_STATE_ERROR; } else { + ERR_clear_error(); if (!(conn->flags & TLS_CONN_FLAG_FD_SET)) { SSL_set_fd(conn->ssl, conn->c.fd); conn->flags |= TLS_CONN_FLAG_FD_SET; @@ -690,6 +698,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->c.conn_handler = NULL; break; case CONN_STATE_ACCEPTING: + ERR_clear_error(); ret = SSL_accept(conn->ssl); if (ret <= 0) { WantIOType want = 0; @@ -747,10 +756,7 @@ static void tlsHandleEvent(tls_connection *conn, int mask) { conn->flags &= ~TLS_CONN_FLAG_READ_WANT_WRITE; if (!callHandler((connection *)conn, conn->c.read_handler)) return; } - - if (mask & AE_READABLE) { - updatePendingData(conn); - } + updatePendingData(conn); break; } @@ -799,6 +805,10 @@ static int connTLSListen(connListener *listener) { return listenToPort(listener); } +static void connTLSCloseListener(connListener *listener) { + 
connectionTypeTcp()->closeListener(listener); +} + static void connTLSShutdown(connection *conn_) { tls_connection *conn = (tls_connection *)conn_; @@ -941,6 +951,7 @@ static int connTLSRead(connection *conn_, void *buf, size_t buf_len) { if (conn->c.state != CONN_STATE_CONNECTED) return -1; ERR_clear_error(); ret = SSL_read(conn->ssl, buf, buf_len); + updateSSLPendingFlag(conn); return updateStateAfterSSLIO(conn, ret, 1); } @@ -967,6 +978,10 @@ static int connTLSSetReadHandler(connection *conn, ConnectionCallbackFunc func) return C_OK; } +static int isBlocking(tls_connection *conn) { + return anetIsBlock(NULL, conn->c.fd); +} + static void setBlockingTimeout(tls_connection *conn, long long timeout) { anetBlock(NULL, conn->c.fd); anetSendTimeout(NULL, conn->c.fd, timeout); @@ -992,7 +1007,7 @@ static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, * which means the specified timeout will not be enforced accurately. */ SSL_set_fd(conn->ssl, conn->c.fd); setBlockingTimeout(conn, timeout); - + ERR_clear_error(); if ((ret = SSL_connect(conn->ssl)) <= 0) { conn->c.state = CONN_STATE_ERROR; return C_ERR; @@ -1005,26 +1020,31 @@ static int connTLSBlockingConnect(connection *conn_, const char *addr, int port, static ssize_t connTLSSyncWrite(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); SSL_clear_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); ERR_clear_error(); int ret = SSL_write(conn->ssl, ptr, size); ret = updateStateAfterSSLIO(conn, ret, 0); SSL_set_mode(conn->ssl, SSL_MODE_ENABLE_PARTIAL_WRITE); - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return ret; } static ssize_t connTLSSyncRead(connection *conn_, char *ptr, ssize_t size, long long timeout) { tls_connection *conn = (tls_connection *)conn_; - + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); 
ERR_clear_error(); int ret = SSL_read(conn->ssl, ptr, size); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return ret; } @@ -1033,6 +1053,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l tls_connection *conn = (tls_connection *)conn_; ssize_t nread = 0; + int blocking = isBlocking(conn); setBlockingTimeout(conn, timeout); size--; @@ -1041,6 +1062,7 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l ERR_clear_error(); int ret = SSL_read(conn->ssl, &c, 1); + updateSSLPendingFlag(conn); ret = updateStateAfterSSLIO(conn, ret, 0); if (ret <= 0) { nread = -1; @@ -1058,7 +1080,9 @@ static ssize_t connTLSSyncReadLine(connection *conn_, char *ptr, ssize_t size, l size--; } exit: - unsetBlockingTimeout(conn); + if (!blocking) { + unsetBlockingTimeout(conn); + } return nread; } @@ -1127,6 +1151,7 @@ static ConnectionType CT_TLS = { .addr = connTLSAddr, .is_local = connTLSIsLocal, .listen = connTLSListen, + .closeListener = connTLSCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateTLS, diff --git a/src/unit/CMakeLists.txt b/src/unit/CMakeLists.txt new file mode 100644 index 0000000000..7d80c533cf --- /dev/null +++ b/src/unit/CMakeLists.txt @@ -0,0 +1,58 @@ +project(valkey-unit-tests) + +file(GLOB UNIT_TEST_SRCS "${CMAKE_CURRENT_LIST_DIR}/*.c") +set(UNIT_TEST_SRCS "${UNIT_TEST_SRCS}") + +get_valkey_server_linker_option(VALKEY_SERVER_LDFLAGS) + +# Build unit tests only +message(STATUS "Building unit tests") +list(APPEND COMPILE_DEFINITIONS "SERVER_TEST=1") +if (USE_TLS) + if (BUILD_TLS_MODULE) + # TLS as a module + list(APPEND COMPILE_DEFINITIONS "USE_OPENSSL=2") + else (BUILD_TLS_MODULE) + # Built-in TLS support + list(APPEND COMPILE_DEFINITIONS "USE_OPENSSL=1") + list(APPEND COMPILE_DEFINITIONS "BUILD_TLS_MODULE=0") + endif () +endif () + +# Build Valkey sources as a 
static library for the test +add_library(valkeylib STATIC ${VALKEY_SERVER_SRCS}) +target_compile_options(valkeylib PRIVATE "${COMPILE_FLAGS}") +target_compile_definitions(valkeylib PRIVATE "${COMPILE_DEFINITIONS}") + +add_executable(valkey-unit-tests ${UNIT_TEST_SRCS}) +target_compile_options(valkey-unit-tests PRIVATE "${COMPILE_FLAGS}") +target_compile_definitions(valkey-unit-tests PRIVATE "${COMPILE_DEFINITIONS}") +add_dependencies(valkey-unit-tests generate_test_files_h) + +if (UNIX AND NOT APPLE) + # Avoid duplicate symbols on non macOS + target_link_options(valkey-unit-tests PRIVATE "-Wl,--allow-multiple-definition") +endif () + +if (USE_JEMALLOC) + # Using jemalloc + target_link_libraries(valkey-unit-tests jemalloc) +endif () + +if (IS_FREEBSD) + target_link_libraries(valkey-unit-tests execinfo) +endif () + +target_link_libraries( + valkey-unit-tests + valkeylib + fpconv + lualib + hdr_histogram + hiredis + ${VALKEY_SERVER_LDFLAGS}) + +if (USE_TLS) + # Add required libraries needed for TLS + target_link_libraries(valkey-unit-tests OpenSSL::SSL hiredis_ssl) +endif () diff --git a/src/unit/README.md b/src/unit/README.md index 1ef439eaeb..93ac82f6dc 100644 --- a/src/unit/README.md +++ b/src/unit/README.md @@ -12,6 +12,7 @@ Tests flags: * UNIT_TEST_ACCURATE: Corresponds to the --accurate flag. This flag indicates the test should use extra computation to more accurately validate the tests. * UNIT_TEST_LARGE_MEMORY: Corresponds to the --large-memory flag. This flag indicates whether or not tests should use more than 100mb of memory. * UNIT_TEST_SINGLE: Corresponds to the --single flag. This flag indicates that a single test is being executed. +* UNIT_TEST_VALGRIND: Corresponds to the --valgrind flag. This flag is just a hint passed to the test to indicate that we are running it under valgrind. Tests are allowed to be passed in additional arbitrary argv/argc, which they can access from the argc and argv arguments of the test. 
diff --git a/src/unit/test_dict.c b/src/unit/test_dict.c index a5af4eef79..b03d252c74 100644 --- a/src/unit/test_dict.c +++ b/src/unit/test_dict.c @@ -5,19 +5,15 @@ uint64_t hashCallback(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -int compareCallback(dict *d, const void *key1, const void *key2) { +int compareCallback(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = strlen((char *)key1); l2 = strlen((char *)key2); if (l1 != l2) return 0; return memcmp(key1, key2, l1) == 0; } -void freeCallback(dict *d, void *val) { - UNUSED(d); - +void freeCallback(void *val) { zfree(val); } diff --git a/src/unit/test_files.h b/src/unit/test_files.h index cd2e0c5b92..bc3eac4222 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -84,6 +84,66 @@ int test_listpackBenchmarkLpValidateIntegrity(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithString(int argc, char **argv, int flags); int test_listpackBenchmarkLpCompareWithNumber(int argc, char **argv, int flags); int test_listpackBenchmarkFree(int argc, char **argv, int flags); +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags); +int test_rewriteClientCommandArgument(int argc, char **argv, int flags); +int test_quicklistCreateList(int argc, char **argv, int flags); +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags); +int test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags); +int test_quicklistAddToHead500xAtCompress(int argc, char **argv, int flags); +int test_quicklistRotateEmpty(int argc, char **argv, int flags); +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags); +int test_quicklistNextPlainNode(int argc, char **argv, int 
flags); +int test_quicklistRotatePlainNode(int argc, char **argv, int flags); +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags); +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags); +int test_quicklistPopEmpty(int argc, char **argv, int flags); +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags); +int test_quicklistPopHead500From500(int argc, char **argv, int flags); +int test_quicklistPopHead5000From500(int argc, char **argv, int flags); +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags); +int test_quicklistIterateReverseOver500List(int argc, char **argv, int flags); +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags); +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags); +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags); +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags); +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags); +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags); +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex12From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags); +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags); +int test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags); +int 
test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags); +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags); +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags); +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, char **argv, int flags); +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags); +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags); +int test_quicklistNumbersOnlyListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags); +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags); +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags); +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags); +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags); +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags); +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags); +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags); +int test_quicklistBookmarkLimit(int argc, char **argv, int flags); +int test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags); +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags); int test_raxRandomWalk(int argc, char **argv, int flags); int 
test_raxIteratorUnitTests(int argc, char **argv, int flags); int test_raxTryInsertUnitTests(int argc, char **argv, int flags); @@ -99,6 +159,7 @@ int test_raxFuzz(int argc, char **argv, int flags); int test_sds(int argc, char **argv, int flags); int test_typesAndAllocSize(int argc, char **argv, int flags); int test_sdsHeaderSizes(int argc, char **argv, int flags); +int test_sdssplitargs(int argc, char **argv, int flags); int test_sha1(int argc, char **argv, int flags); int test_string2ll(int argc, char **argv, int flags); int test_string2l(int argc, char **argv, int flags); @@ -107,6 +168,7 @@ int test_ld2string(int argc, char **argv, int flags); int test_fixedpoint_d2string(int argc, char **argv, int flags); int test_version2num(int argc, char **argv, int flags); int test_reclaimFilePageCache(int argc, char **argv, int flags); +int test_valkey_strtod(int argc, char **argv, int flags); int test_ziplistCreateIntList(int argc, char **argv, int flags); int test_ziplistPop(int argc, char **argv, int flags); int test_ziplistGetElementAtIndex3(int argc, char **argv, int flags); @@ -156,10 +218,13 @@ unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, N unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, 
{"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, 
{"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, 
{"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, {"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; +unitTest __test_networking_c[] = {{"test_backupAndUpdateClientArgv", test_backupAndUpdateClientArgv}, {"test_rewriteClientCommandArgument", test_rewriteClientCommandArgument}, {NULL, NULL}}; +unitTest __test_quicklist_c[] = {{"test_quicklistCreateList", test_quicklistCreateList}, {"test_quicklistAddToTailOfEmptyList", test_quicklistAddToTailOfEmptyList}, {"test_quicklistAddToHeadOfEmptyList", test_quicklistAddToHeadOfEmptyList}, {"test_quicklistAddToTail5xAtCompress", test_quicklistAddToTail5xAtCompress}, {"test_quicklistAddToHead5xAtCompress", test_quicklistAddToHead5xAtCompress}, {"test_quicklistAddToTail500xAtCompress", test_quicklistAddToTail500xAtCompress}, {"test_quicklistAddToHead500xAtCompress", test_quicklistAddToHead500xAtCompress}, {"test_quicklistRotateEmpty", test_quicklistRotateEmpty}, {"test_quicklistComprassionPlainNode", test_quicklistComprassionPlainNode}, {"test_quicklistNextPlainNode", test_quicklistNextPlainNode}, {"test_quicklistRotatePlainNode", test_quicklistRotatePlainNode}, {"test_quicklistRotateOneValOnce", test_quicklistRotateOneValOnce}, {"test_quicklistRotate500Val5000TimesAtCompress", test_quicklistRotate500Val5000TimesAtCompress}, {"test_quicklistPopEmpty", test_quicklistPopEmpty}, {"test_quicklistPop1StringFrom1", test_quicklistPop1StringFrom1}, {"test_quicklistPopHead1NumberFrom1", test_quicklistPopHead1NumberFrom1}, {"test_quicklistPopHead500From500", test_quicklistPopHead500From500}, {"test_quicklistPopHead5000From500", test_quicklistPopHead5000From500}, {"test_quicklistIterateForwardOver500List", test_quicklistIterateForwardOver500List}, {"test_quicklistIterateReverseOver500List", test_quicklistIterateReverseOver500List}, {"test_quicklistInsertAfter1Element", 
test_quicklistInsertAfter1Element}, {"test_quicklistInsertBefore1Element", test_quicklistInsertBefore1Element}, {"test_quicklistInsertHeadWhileHeadNodeIsFull", test_quicklistInsertHeadWhileHeadNodeIsFull}, {"test_quicklistInsertTailWhileTailNodeIsFull", test_quicklistInsertTailWhileTailNodeIsFull}, {"test_quicklistInsertOnceInElementsWhileIteratingAtCompress", test_quicklistInsertOnceInElementsWhileIteratingAtCompress}, {"test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress", test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress}, {"test_quicklistDuplicateEmptyList", test_quicklistDuplicateEmptyList}, {"test_quicklistDuplicateListOf1Element", test_quicklistDuplicateListOf1Element}, {"test_quicklistDuplicateListOf500", test_quicklistDuplicateListOf500}, {"test_quicklistIndex1200From500ListAtFill", test_quicklistIndex1200From500ListAtFill}, {"test_quicklistIndex12From500ListAtFill", test_quicklistIndex12From500ListAtFill}, {"test_quicklistIndex100From500ListAtFill", test_quicklistIndex100From500ListAtFill}, {"test_quicklistIndexTooBig1From50ListAtFill", test_quicklistIndexTooBig1From50ListAtFill}, {"test_quicklistDeleteRangeEmptyList", test_quicklistDeleteRangeEmptyList}, {"test_quicklistDeleteRangeOfEntireNodeInListOfOneNode", test_quicklistDeleteRangeOfEntireNodeInListOfOneNode}, {"test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts", test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts}, {"test_quicklistDeleteMiddle100Of500List", test_quicklistDeleteMiddle100Of500List}, {"test_quicklistDeleteLessThanFillButAcrossNodes", test_quicklistDeleteLessThanFillButAcrossNodes}, {"test_quicklistDeleteNegative1From500List", test_quicklistDeleteNegative1From500List}, {"test_quicklistDeleteNegative1From500ListWithOverflowCounts", test_quicklistDeleteNegative1From500ListWithOverflowCounts}, 
{"test_quicklistDeleteNegative100From500List", test_quicklistDeleteNegative100From500List}, {"test_quicklistDelete10Count5From50List", test_quicklistDelete10Count5From50List}, {"test_quicklistNumbersOnlyListRead", test_quicklistNumbersOnlyListRead}, {"test_quicklistNumbersLargerListRead", test_quicklistNumbersLargerListRead}, {"test_quicklistNumbersLargerListReadB", test_quicklistNumbersLargerListReadB}, {"test_quicklistLremTestAtCompress", test_quicklistLremTestAtCompress}, {"test_quicklistIterateReverseDeleteAtCompress", test_quicklistIterateReverseDeleteAtCompress}, {"test_quicklistIteratorAtIndexTestAtCompress", test_quicklistIteratorAtIndexTestAtCompress}, {"test_quicklistLtrimTestAAtCompress", test_quicklistLtrimTestAAtCompress}, {"test_quicklistLtrimTestBAtCompress", test_quicklistLtrimTestBAtCompress}, {"test_quicklistLtrimTestCAtCompress", test_quicklistLtrimTestCAtCompress}, {"test_quicklistLtrimTestDAtCompress", test_quicklistLtrimTestDAtCompress}, {"test_quicklistVerifySpecificCompressionOfInteriorNodes", test_quicklistVerifySpecificCompressionOfInteriorNodes}, {"test_quicklistBookmarkGetUpdatedToNextItem", test_quicklistBookmarkGetUpdatedToNextItem}, {"test_quicklistBookmarkLimit", test_quicklistBookmarkLimit}, {"test_quicklistCompressAndDecompressQuicklistListpackNode", test_quicklistCompressAndDecompressQuicklistListpackNode}, {"test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX", test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRandomWalk", test_raxRandomWalk}, {"test_raxIteratorUnitTests", test_raxIteratorUnitTests}, {"test_raxTryInsertUnitTests", test_raxTryInsertUnitTests}, {"test_raxRegressionTest1", test_raxRegressionTest1}, {"test_raxRegressionTest2", test_raxRegressionTest2}, {"test_raxRegressionTest3", test_raxRegressionTest3}, {"test_raxRegressionTest4", test_raxRegressionTest4}, {"test_raxRegressionTest5", test_raxRegressionTest5}, 
{"test_raxRegressionTest6", test_raxRegressionTest6}, {"test_raxBenchmark", test_raxBenchmark}, {"test_raxHugeKey", test_raxHugeKey}, {"test_raxFuzz", test_raxFuzz}, {NULL, NULL}}; -unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}}; +unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {"test_sdssplitargs", test_sdssplitargs}, {NULL, NULL}}; unitTest __test_sha1_c[] = {{"test_sha1", test_sha1}, {NULL, NULL}}; unitTest __test_util_c[] = {{"test_string2ll", test_string2ll}, {"test_string2l", test_string2l}, {"test_ll2string", test_ll2string}, {"test_ld2string", test_ld2string}, {"test_fixedpoint_d2string", test_fixedpoint_d2string}, {"test_version2num", test_version2num}, {"test_reclaimFilePageCache", test_reclaimFilePageCache}, {NULL, NULL}}; +unitTest __test_valkey_strtod_c[] = {{"test_valkey_strtod", test_valkey_strtod}, {NULL, NULL}}; unitTest __test_ziplist_c[] = {{"test_ziplistCreateIntList", test_ziplistCreateIntList}, {"test_ziplistPop", test_ziplistPop}, {"test_ziplistGetElementAtIndex3", test_ziplistGetElementAtIndex3}, {"test_ziplistGetElementOutOfRange", test_ziplistGetElementOutOfRange}, {"test_ziplistGetLastElement", test_ziplistGetLastElement}, {"test_ziplistGetFirstElement", test_ziplistGetFirstElement}, {"test_ziplistGetElementOutOfRangeReverse", test_ziplistGetElementOutOfRangeReverse}, {"test_ziplistIterateThroughFullList", test_ziplistIterateThroughFullList}, {"test_ziplistIterateThroughListFrom1ToEnd", test_ziplistIterateThroughListFrom1ToEnd}, {"test_ziplistIterateThroughListFrom2ToEnd", test_ziplistIterateThroughListFrom2ToEnd}, {"test_ziplistIterateThroughStartOutOfRange", test_ziplistIterateThroughStartOutOfRange}, {"test_ziplistIterateBackToFront", test_ziplistIterateBackToFront}, {"test_ziplistIterateBackToFrontDeletingAllItems", 
test_ziplistIterateBackToFrontDeletingAllItems}, {"test_ziplistDeleteInclusiveRange0To0", test_ziplistDeleteInclusiveRange0To0}, {"test_ziplistDeleteInclusiveRange0To1", test_ziplistDeleteInclusiveRange0To1}, {"test_ziplistDeleteInclusiveRange1To2", test_ziplistDeleteInclusiveRange1To2}, {"test_ziplistDeleteWithStartIndexOutOfRange", test_ziplistDeleteWithStartIndexOutOfRange}, {"test_ziplistDeleteWithNumOverflow", test_ziplistDeleteWithNumOverflow}, {"test_ziplistDeleteFooWhileIterating", test_ziplistDeleteFooWhileIterating}, {"test_ziplistReplaceWithSameSize", test_ziplistReplaceWithSameSize}, {"test_ziplistReplaceWithDifferentSize", test_ziplistReplaceWithDifferentSize}, {"test_ziplistRegressionTestForOver255ByteStrings", test_ziplistRegressionTestForOver255ByteStrings}, {"test_ziplistRegressionTestDeleteNextToLastEntries", test_ziplistRegressionTestDeleteNextToLastEntries}, {"test_ziplistCreateLongListAndCheckIndices", test_ziplistCreateLongListAndCheckIndices}, {"test_ziplistCompareStringWithZiplistEntries", test_ziplistCompareStringWithZiplistEntries}, {"test_ziplistMergeTest", test_ziplistMergeTest}, {"test_ziplistStressWithRandomPayloadsOfDifferentEncoding", test_ziplistStressWithRandomPayloadsOfDifferentEncoding}, {"test_ziplistCascadeUpdateEdgeCases", test_ziplistCascadeUpdateEdgeCases}, {"test_ziplistInsertEdgeCase", test_ziplistInsertEdgeCase}, {"test_ziplistStressWithVariableSize", test_ziplistStressWithVariableSize}, {"test_BenchmarkziplistFind", test_BenchmarkziplistFind}, {"test_BenchmarkziplistIndex", test_BenchmarkziplistIndex}, {"test_BenchmarkziplistValidateIntegrity", test_BenchmarkziplistValidateIntegrity}, {"test_BenchmarkziplistCompareWithString", test_BenchmarkziplistCompareWithString}, {"test_BenchmarkziplistCompareWithNumber", test_BenchmarkziplistCompareWithNumber}, {"test_ziplistStress__ziplistCascadeUpdate", test_ziplistStress__ziplistCascadeUpdate}, {NULL, NULL}}; unitTest __test_zipmap_c[] = {{"test_zipmapIterateWithLargeKey", 
test_zipmapIterateWithLargeKey}, {"test_zipmapIterateThroughElements", test_zipmapIterateThroughElements}, {NULL, NULL}}; unitTest __test_zmalloc_c[] = {{"test_zmallocInitialUsedMemory", test_zmallocInitialUsedMemory}, {"test_zmallocAllocReallocCallocAndFree", test_zmallocAllocReallocCallocAndFree}, {"test_zmallocAllocZeroByteAndFree", test_zmallocAllocZeroByteAndFree}, {NULL, NULL}}; @@ -175,10 +240,13 @@ struct unitTestSuite { {"test_intset.c", __test_intset_c}, {"test_kvstore.c", __test_kvstore_c}, {"test_listpack.c", __test_listpack_c}, + {"test_networking.c", __test_networking_c}, + {"test_quicklist.c", __test_quicklist_c}, {"test_rax.c", __test_rax_c}, {"test_sds.c", __test_sds_c}, {"test_sha1.c", __test_sha1_c}, {"test_util.c", __test_util_c}, + {"test_valkey_strtod.c", __test_valkey_strtod_c}, {"test_ziplist.c", __test_ziplist_c}, {"test_zipmap.c", __test_zipmap_c}, {"test_zmalloc.c", __test_zmalloc_c}, diff --git a/src/unit/test_help.h b/src/unit/test_help.h index 804a7e3449..51e77d19d3 100644 --- a/src/unit/test_help.h +++ b/src/unit/test_help.h @@ -18,10 +18,12 @@ /* The flags are the following: * --accurate: Runs tests with more iterations. * --large-memory: Enables tests that consume more than 100mb. - * --single: A flag to indicate a specific test file was executed. */ + * --single: A flag to indicate a specific test file was executed. + * --valgrind: Runs tests with valgrind. 
*/ #define UNIT_TEST_ACCURATE (1 << 0) #define UNIT_TEST_LARGE_MEMORY (1 << 1) #define UNIT_TEST_SINGLE (1 << 2) +#define UNIT_TEST_VALGRIND (1 << 3) #define KRED "\33[31m" #define KGRN "\33[32m" diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index b3eff7d132..062b9f32fc 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -5,8 +5,7 @@ uint64_t hashTestCallback(const void *key) { return dictGenHashFunction((unsigned char *)key, strlen((char *)key)); } -void freeTestCallback(dict *d, void *val) { - UNUSED(d); +void freeTestCallback(void *val) { zfree(val); } diff --git a/src/unit/test_main.c b/src/unit/test_main.c index 277d1b42c1..1b7cd8c96d 100644 --- a/src/unit/test_main.c +++ b/src/unit/test_main.c @@ -49,6 +49,8 @@ int main(int argc, char **argv) { else if (!strcasecmp(arg, "--single") && (j + 1 < argc)) { flags |= UNIT_TEST_SINGLE; file = argv[j + 1]; + } else if (!strcasecmp(arg, "--valgrind")) { + flags |= UNIT_TEST_VALGRIND; } } diff --git a/src/unit/test_networking.c b/src/unit/test_networking.c new file mode 100644 index 0000000000..566583bcc5 --- /dev/null +++ b/src/unit/test_networking.c @@ -0,0 +1,131 @@ +#include "../networking.c" +#include "../server.c" +#include "test_help.h" + +#include + +int test_backupAndUpdateClientArgv(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + + /* Test 1: Initial backup of arguments */ + c->argc = 2; + robj **initial_argv = zmalloc(sizeof(robj *) * 2); + c->argv = initial_argv; + c->argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + c->argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + c->original_argv = NULL; + + backupAndUpdateClientArgv(c, 3, NULL); + + TEST_ASSERT(c->argv != initial_argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->argc == 3); + TEST_ASSERT(c->argv_len == 3); + 
TEST_ASSERT(c->argv[0]->refcount == 2); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv[2] == NULL); + + /* Test 2: Direct argv replacement */ + robj **new_argv = zmalloc(sizeof(robj *) * 2); + new_argv[0] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test")); + new_argv[1] = createObject(OBJ_STRING, sdscatfmt(sdsempty(), "test2")); + + backupAndUpdateClientArgv(c, 2, new_argv); + + TEST_ASSERT(c->argv == new_argv); + TEST_ASSERT(c->argc == 2); + TEST_ASSERT(c->argv_len == 2); + TEST_ASSERT(c->original_argv != c->argv); + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argc == 2); + TEST_ASSERT(c->original_argv[0]->refcount == 1); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Expanding argc */ + backupAndUpdateClientArgv(c, 4, NULL); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv_len == 4); + TEST_ASSERT(c->argv[0] != NULL); + TEST_ASSERT(c->argv[1] != NULL); + TEST_ASSERT(c->argv[2] == NULL); + TEST_ASSERT(c->argv[3] == NULL); + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->original_argc; i++) { + decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + zfree(c); + + return 0; +} + +int test_rewriteClientCommandArgument(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + client *c = zmalloc(sizeof(client)); + c->argc = 3; + robj **initial_argv = zmalloc(sizeof(robj *) * 3); + c->argv = initial_argv; + c->original_argv = NULL; + c->argv_len_sum = 0; + + /* Initialize client with command "SET key value" */ + c->argv[0] = createStringObject("SET", 3); + robj *original_key = createStringObject("key", 3); + c->argv[1] = original_key; + c->argv[2] = createStringObject("value", 5); + c->argv_len_sum = 11; // 3 + 3 + 5 + + /* Test 1: Rewrite existing argument */ + robj *newval = createStringObject("newkey", 
6); + rewriteClientCommandArgument(c, 1, newval); + + TEST_ASSERT(c->argv[1] == newval); + TEST_ASSERT(c->argv[1]->refcount == 2); + TEST_ASSERT(c->argv_len_sum == 14); // 3 + 6 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + TEST_ASSERT(c->original_argv[1] == original_key); + TEST_ASSERT(c->original_argv[1]->refcount == 1); + + /* Test 3: Extend argument vector */ + robj *extraval = createStringObject("extra", 5); + rewriteClientCommandArgument(c, 3, extraval); + + TEST_ASSERT(c->argc == 4); + TEST_ASSERT(c->argv[3] == extraval); + TEST_ASSERT(c->argv_len_sum == 19); // 3 + 6 + 5 + 5 + TEST_ASSERT(c->original_argv == initial_argv); + + /* Cleanup */ + for (int i = 0; i < c->argc; i++) { + if (c->argv[i]) decrRefCount(c->argv[i]); + } + zfree(c->argv); + + for (int i = 0; i < c->original_argc; i++) { + if (c->original_argv[i]) decrRefCount(c->original_argv[i]); + } + zfree(c->original_argv); + + decrRefCount(newval); + decrRefCount(extraval); + + zfree(c); + + return 0; +} diff --git a/src/unit/test_quicklist.c b/src/unit/test_quicklist.c new file mode 100644 index 0000000000..6addb33f41 --- /dev/null +++ b/src/unit/test_quicklist.c @@ -0,0 +1,2300 @@ +#include +#include +#include +#include "test_help.h" +#include +#include + +#include "../zmalloc.h" +#include "../listpack.h" +#include "../quicklist.c" + +static int options[] = {0, 1, 2, 3, 4, 5, 6, 10}; +static int option_count = 8; + +static int fills[] = {-5, -4, -3, -2, -1, 0, + 1, 2, 32, 66, 128, 999}; +static int fill_count = 12; +static long long runtime[8]; +static unsigned int err = 0; + +/*----------------------------------------------------------------------------- + * Unit Function + *----------------------------------------------------------------------------*/ +/* Return the UNIX time in microseconds */ +static long long ustime(void) { + struct timeval tv; + long long ust; + + gettimeofday(&tv, NULL); + ust = ((long long)tv.tv_sec) * 1000000; + ust += tv.tv_usec; + return ust; +} + +/* Return 
the UNIX time in milliseconds */ +static long long mstime(void) { + return ustime() / 1000; +} + +/* Generate new string concatenating integer i against string 'prefix' */ +static char *genstr(char *prefix, int i) { + static char result[64] = {0}; + snprintf(result, sizeof(result), "%s%d", prefix, i); + return result; +} + +__attribute__((unused)) static void randstring(unsigned char *target, size_t sz) { + size_t p = 0; + int minval, maxval; + switch (rand() % 3) { + case 0: + minval = 'a'; + maxval = 'z'; + break; + case 1: + minval = '0'; + maxval = '9'; + break; + case 2: + minval = 'A'; + maxval = 'Z'; + break; + default: + abort(); + } + + while (p < sz) + target[p++] = minval + rand() % (maxval - minval + 1); +} + +#define TEST(name) printf("test — %s\n", name); + +#define QL_TEST_VERBOSE 0 +static void ql_info(quicklist *ql) { +#if QL_TEST_VERBOSE + TEST_PRINT_INFO("Container length: %lu\n", ql->len); + TEST_PRINT_INFO("Container size: %lu\n", ql->count); + if (ql->head) + TEST_PRINT_INFO("\t(zsize head: %lu)\n", lpLength(ql->head->entry)); + if (ql->tail) + TEST_PRINT_INFO("\t(zsize tail: %lu)\n", lpLength(ql->tail->entry)); +#else + UNUSED(ql); +#endif +} + +/* Iterate over an entire quicklist. + * Print the list if 'print' == 1. + * + * Returns physical count of elements found by iterating over the list. */ +static int _itrprintr(quicklist *ql, int print, int forward) { + quicklistIter *iter = + quicklistGetIterator(ql, forward ? AL_START_HEAD : AL_START_TAIL); + quicklistEntry entry; + int i = 0; + int p = 0; + quicklistNode *prev = NULL; + while (quicklistNext(iter, &entry)) { + if (entry.node != prev) { + /* Count the number of list nodes too */ + p++; + prev = entry.node; + } + if (print) { + int size = (entry.sz > (1 << 20)) ? 
1 << 20 : entry.sz; + TEST_PRINT_INFO("[%3d (%2d)]: [%.*s] (%lld)\n", i, p, size, + (char *)entry.value, entry.longval); + } + i++; + } + quicklistReleaseIterator(iter); + return i; +} + +/* Walk the list from head to tail via _itrprintr() and return its result. */ +static int itrprintr(quicklist *ql, int print) { + return _itrprintr(ql, print, 1); +} + +/* Walk the list from tail to head via _itrprintr() and return its result. */ +static int itrprintr_rev(quicklist *ql, int print) { + return _itrprintr(ql, print, 0); +} + +/* Run _ql_verify() and add the number of mismatches it reports to 'err'. */ +#define ql_verify(a, b, c, d, e) \ + do { \ + err += _ql_verify((a), (b), (c), (d), (e)); \ + } while (0) + +/* Count nodes whose encoding contradicts the list's compress depth. + * The first and last 'ql->compress' nodes must be RAW; every node in between + * must be LZF unless its 'attempted_compress' flag is set. + * Returns the number of violations, or 0 when quicklistAllowsCompression() + * is false for the list. */ +static int _ql_verify_compress(quicklist *ql) { + int errors = 0; + if (quicklistAllowsCompression(ql)) { + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + /* Stop if the chain ends before ql->len nodes were seen, so that a NULL node is never dereferenced. */ + for (unsigned int at = 0; node && at < ql->len; at++, node = node->next) { + if (at < low_raw || at >= high_raw) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + TEST_PRINT_INFO("Incorrect compression: node %d is " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; recompress: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress); + errors++; + } + } else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF && + !node->attempted_compress) { + TEST_PRINT_INFO("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; recompress: %d; attempted: %d)", + at, ql->compress, low_raw, high_raw, ql->len, node->sz, + node->recompress, node->attempted_compress); + errors++; + } + } + } + } + return errors; +} + +/* Verify list metadata matches physical list contents.
*/ +static int _ql_verify(quicklist *ql, uint32_t len, uint32_t count, uint32_t head_count, uint32_t tail_count) { + int errors = 0; + + ql_info(ql); + if (len != ql->len) { + TEST_PRINT_INFO("quicklist length wrong: expected %d, got %lu", len, ql->len); + errors++; + } + + if (count != ql->count) { + TEST_PRINT_INFO("quicklist count wrong: expected %d, got %lu", count, ql->count); + errors++; + } + + int loopr = itrprintr(ql, 0); + if (loopr != (int)ql->count) { + TEST_PRINT_INFO("quicklist cached count not match actual count: expected %lu, got " + "%d", + ql->count, loopr); + errors++; + } + + int rloopr = itrprintr_rev(ql, 0); + if (loopr != rloopr) { + TEST_PRINT_INFO("quicklist has different forward count than reverse count! " + "Forward count is %d, reverse count is %d.", + loopr, rloopr); + errors++; + } + + if (ql->len == 0 && !errors) { + return errors; + } + + if (ql->head && head_count != ql->head->count && + head_count != lpLength(ql->head->entry)) { + TEST_PRINT_INFO("quicklist head count wrong: expected %d, " + "got cached %d vs. actual %lu", + head_count, ql->head->count, lpLength(ql->head->entry)); + errors++; + } + + if (ql->tail && tail_count != ql->tail->count && + tail_count != lpLength(ql->tail->entry)) { + TEST_PRINT_INFO("quicklist tail count wrong: expected %d, " + "got cached %u vs. actual %lu", + tail_count, ql->tail->count, lpLength(ql->tail->entry)); + errors++; + } + + errors += _ql_verify_compress(ql); + return errors; +} + +/* Release iterator and verify compress correctly. 
*/ +static void ql_release_iterator(quicklistIter *iter) { + quicklist *ql = NULL; + if (iter) ql = iter->quicklist; + quicklistReleaseIterator(iter); + if (ql && _ql_verify_compress(ql)) { + abort(); + } +} + +/*----------------------------------------------------------------------------- + * Quicklist Unit Test + *----------------------------------------------------------------------------*/ +int test_quicklistCreateList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("create list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToTailOfEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail of empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHeadOfEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head of empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + /* 1 for head and 1 for tail because 1 node = head = tail */ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistAddToTail5xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail 5x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 5; i++) quicklistPushTail(ql, genstr("hello", i), 32); + if (ql->count != 5) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHead5xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head 5x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 5; i++) quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 5) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 1, 5, 5, 5); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToTail500xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to tail 500x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 64); + if (ql->count != 500) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 16, 500, 32, 20); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + 
TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistAddToHead500xAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("add to head 500x at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + if (ql->count != 500) { + err++; + }; + if (fills[f] == 32) ql_verify(ql, 16, 500, 20, 32); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotateEmpty(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate empty"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistRotate(ql); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistComprassionPlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("Comprassion Plain node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + + char buf[large_limit]; + quicklist *ql = quicklistNew(fills[f], 1); + for (int i = 0; i < 500; i++) { + /* Set to 256 to allow the node to be triggered to compress, + * if it is less than 48(nocompress), the test will be successful. 
*/ + snprintf(buf, sizeof(buf), "hello%d", i); + quicklistPushHead(ql, buf, large_limit); + } + + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + TEST_ASSERT(QL_NODE_IS_PLAIN(entry.node)); + snprintf(buf, sizeof(buf), "hello%d", i); + if (strcmp((char *)entry.value, buf)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, buf, i); + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNextPlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("NEXT plain node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + quicklist *ql = quicklistNew(fills[f], options[_i]); + + char buf[large_limit]; + memcpy(buf, "plain", 5); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, buf, large_limit); + quicklistPushHead(ql, "packed3", 7); + quicklistPushHead(ql, "packed4", 7); + quicklistPushHead(ql, buf, large_limit); + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + + while (quicklistNext(iter, &entry) != 0) { + if (QL_NODE_IS_PLAIN(entry.node)) + TEST_ASSERT(!memcmp(entry.value, "plain", 5)); + else + TEST_ASSERT(!memcmp(entry.value, "packed", 6)); + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotatePlainNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate plain node"); 
+ for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + size_t large_limit = (fills[f] < 0) ? quicklistNodeNegFillLimit(fills[f]) + 1 : SIZE_SAFETY_LIMIT + 1; + + unsigned char *data = NULL; + size_t sz; + long long lv; + int i = 0; + quicklist *ql = quicklistNew(fills[f], options[_i]); + char buf[large_limit]; + memcpy(buf, "hello1", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello4", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello3", 6); + quicklistPushHead(ql, buf, large_limit); + memcpy(buf, "hello2", 6); + quicklistPushHead(ql, buf, large_limit); + quicklistRotate(ql); + + for (i = 1; i < 5; i++) { + TEST_ASSERT(QL_NODE_IS_PLAIN(ql->tail)); + quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + int temp_char = data[5]; + zfree(data); + TEST_ASSERT(temp_char == ('0' + i)); + } + + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotateOneValOnce(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate one val once"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistRotate(ql); + /* Ignore compression verify because listpack is + * too small to compress. 
*/ + ql_verify(ql, 1, 1, 1, 1); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistRotate500Val5000TimesAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("rotate 500 val 5000 times at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushHead(ql, "900", 3); + quicklistPushHead(ql, "7000", 4); + quicklistPushHead(ql, "-1200", 5); + quicklistPushHead(ql, "42", 2); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 64); + ql_info(ql); + for (int i = 0; i < 5000; i++) { + ql_info(ql); + quicklistRotate(ql); + } + if (fills[f] == 1) + ql_verify(ql, 504, 504, 1, 1); + else if (fills[f] == 2) + ql_verify(ql, 252, 504, 2, 2); + else if (fills[f] == 32) + ql_verify(ql, 16, 504, 32, 24); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopEmpty(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop empty"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPop(ql, QUICKLIST_HEAD, NULL, NULL, NULL); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPop1StringFrom1(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop 1 string from 1"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, 
options[_i]); + char *populate = genstr("hello", 331); + quicklistPushHead(ql, populate, 32); + unsigned char *data; + size_t sz; + long long lv; + ql_info(ql); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if (strcmp(populate, (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value (%s)", size, data, populate); + err++; + } + zfree(data); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead1NumberFrom1(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 1 number from 1"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "55513", 5); + unsigned char *data; + size_t sz; + long long lv; + ql_info(ql); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv)); + TEST_ASSERT(data == NULL); + TEST_ASSERT(lv == 55513); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead500From500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 500 from 500"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_info(ql); + for (int i = 0; i < 500; i++) { + unsigned char *data; + size_t sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + TEST_ASSERT(ret == 1); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if 
(strcmp(genstr("hello", 499 - i), (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value (%s)", size, data, genstr("hello", 499 - i)); + err++; + } + zfree(data); + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistPopHead5000From500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("pop head 5000 from 500"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 5000; i++) { + unsigned char *data; + size_t sz; + long long lv; + int ret = quicklistPop(ql, QUICKLIST_HEAD, &data, &sz, &lv); + if (i < 500) { + TEST_ASSERT(ret == 1); + TEST_ASSERT(data != NULL); + TEST_ASSERT(sz == 32); + if (strcmp(genstr("hello", 499 - i), (char *)data)) { + int size = sz; + TEST_PRINT_INFO("Pop'd value (%.*s) didn't equal original value " + "(%s)", + size, data, genstr("hello", 499 - i)); + err++; + } + zfree(data); + } else { + TEST_ASSERT(ret == 0); + } + } + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIterateForwardOver500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate forward over 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 499, count = 0; + 
while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, h, i); + err++; + } + i--; + count++; + } + /* Report how many entries were visited; 'i' counts down from 499 and is not the element count. */ + if (count != 500) { + TEST_PRINT_INFO("Didn't iterate over exactly 500 elements (%d)", count); + err++; + } + ql_verify(ql, 16, 500, 20, 32); + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +/* For every compress option: push "hello0".."hello499" at the head with fill 32, + * then iterate from the tail and expect "hello0", "hello1", ... in that order + * and exactly 500 entries. */ +int test_quicklistIterateReverseOver500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate reverse over 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + char *h = genstr("hello", i); + if (strcmp((char *)entry.value, h)) { + TEST_PRINT_INFO("value [%s] didn't match [%s] at position %d", entry.value, h, i); + err++; + } + i++; + } + if (i != 500) { + TEST_PRINT_INFO("Didn't iterate over exactly 500 elements (%d)", i); + err++; + } + ql_verify(ql, 16, 500, 20, 32); + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertAfter1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert after 1 element"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + iter =
quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + quicklistInsertAfter(iter, &entry, "abc", 4); + ql_release_iterator(iter); + ql_verify(ql, 1, 2, 2, 2); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "hello", 5)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + sz = entry.sz; + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertBefore1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert before 1 element"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, "hello", 6); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + quicklistInsertBefore(iter, &entry, "abc", 4); + ql_release_iterator(iter); + ql_verify(ql, 1, 2, 2, 2); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + sz = entry.sz; + if (strncmp((char *)entry.value, "hello", 5)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + 
return 0; +} + +int test_quicklistInsertHeadWhileHeadNodeIsFull(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert head while head node is full"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(4, options[_i]); + for (int i = 0; i < 10; i++) quicklistPushTail(ql, genstr("hello", i), 6); + quicklistSetFill(ql, -1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -10, &entry); + char buf[4096] = {0}; + quicklistInsertBefore(iter, &entry, buf, 4096); + ql_release_iterator(iter); + ql_verify(ql, 4, 11, 1, 2); + quicklistRelease(ql); + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertTailWhileTailNodeIsFull(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert tail while tail node is full"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(4, options[_i]); + for (int i = 0; i < 10; i++) quicklistPushHead(ql, genstr("hello", i), 6); + quicklistSetFill(ql, -1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + char buf[4096] = {0}; + quicklistInsertAfter(iter, &entry, buf, 4096); + ql_release_iterator(iter); + ql_verify(ql, 4, 11, 2, 1); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertOnceInElementsWhileIteratingAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert once in elements while iterating at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], 
options[_i]); + quicklistPushTail(ql, "abc", 3); + quicklistSetFill(ql, 1); + quicklistPushTail(ql, "def", 3); /* force to unique node */ + quicklistSetFill(ql, f); + quicklistPushTail(ql, "bob", 3); /* force to reset for +3 */ + quicklistPushTail(ql, "foo", 3); + quicklistPushTail(ql, "zoo", 3); + + itrprintr(ql, 0); + /* insert "bar" before "bob" while iterating over list. */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + while (quicklistNext(iter, &entry)) { + if (!strncmp((char *)entry.value, "bob", 3)) { + /* Insert as fill = 1 so it spills into new node. */ + quicklistInsertBefore(iter, &entry, "bar", 3); + break; /* didn't we fix insert-while-iterating? */ + } + } + ql_release_iterator(iter); + itrprintr(ql, 0); + + /* verify results */ + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + int sz = entry.sz; + + if (strncmp((char *)entry.value, "abc", 3)) { + TEST_PRINT_INFO("Value 0 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (strncmp((char *)entry.value, "def", 3)) { + TEST_PRINT_INFO("Value 1 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); + if (strncmp((char *)entry.value, "bar", 3)) { + TEST_PRINT_INFO("Value 2 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); + if (strncmp((char *)entry.value, "bob", 3)) { + TEST_PRINT_INFO("Value 3 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); + if (strncmp((char *)entry.value, "foo", 3)) { + TEST_PRINT_INFO("Value 4 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 
5, &entry); + if (strncmp((char *)entry.value, "zoo", 3)) { + TEST_PRINT_INFO("Value 5 didn't match, instead got: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertBefore250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert [before] 250 new in middle of 500 elements at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); + quicklistInsertBefore(iter, &entry, genstr("abc", i), 32); + ql_release_iterator(iter); + } + if (fills[f] == 32) ql_verify(ql, 25, 750, 32, 20); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistInsertAfter250NewInMiddleOf500ElementsAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("insert [after] 250 new in middle of 500 elements at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + for (int i = 0; i < 250; i++) { + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 250, &entry); + quicklistInsertAfter(iter, &entry, genstr("abc", i), 32); + ql_release_iterator(iter); + } + + if (ql->count != 750) { + 
TEST_PRINT_INFO("List size not 750, but rather %ld", ql->count); + err++; + } + + if (fills[f] == 32) ql_verify(ql, 26, 750, 20, 32); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate empty list"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + ql_verify(ql, 0, 0, 0, 0); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 0, 0, 0, 0); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateListOf1Element(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate list of 1 element"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushHead(ql, genstr("hello", 3), 32); + ql_verify(ql, 1, 1, 1, 1); + quicklist *copy = quicklistDup(ql); + ql_verify(copy, 1, 1, 1, 1); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDuplicateListOf500(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("duplicate list of 500"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 16, 500, 20, 32); + + quicklist *copy = quicklistDup(ql); + 
ql_verify(copy, 16, 500, 20, 32); + quicklistRelease(ql); + quicklistRelease(copy); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex1200From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index 1,200 from 500 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (strcmp((char *)entry.value, "hello2") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 200, &entry); + if (strcmp((char *)entry.value, "hello201") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex12From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index -1,-2 from 500 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (strcmp((char *)entry.value, "hello500") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); + 
if (strcmp((char *)entry.value, "hello499") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndex100From500ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index -100 from 500 list at fill at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, -100, &entry); + if (strcmp((char *)entry.value, "hello401") != 0) { + TEST_PRINT_INFO("Value: %s", entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIndexTooBig1From50ListAtFill(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("index too big +1 from 50 list at fill at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 50, &entry); + if (iter) { + int sz = entry.sz; /* read entry only after the lookup call; it is uninitialized before it */ + TEST_PRINT_INFO("Index found at 50 with 50 list: %.*s", sz, entry.value); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistDeleteRangeEmptyList(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range empty list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistDelRange(ql, 5, 20); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteRangeOfEntireNodeInListOfOneNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range of entire node in list of one node"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 32); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteRangeOfEntireNodeWithOverflowCounts(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete range of entire node with overflow counts"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + for (int i = 0; i < 32; i++) quicklistPushHead(ql, genstr("hello", i), 32); + ql_verify(ql, 1, 32, 32, 32); + quicklistDelRange(ql, 0, 128); + ql_verify(ql, 0, 0, 0, 0); + quicklistRelease(ql); + + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteMiddle100Of500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + 
quicklistIter *iter; + TEST("delete middle 100 of 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, 200, 100); + ql_verify(ql, 14, 400, 32, 20); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteLessThanFillButAcrossNodes(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete less than fill but across nodes"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, 60, 10); + ql_verify(ql, 16, 490, 32, 20); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative1From500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 1 from 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 1); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative1From500ListWithOverflowCounts(int argc, 
char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 1 from 500 list with overflow counts"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 16, 500, 32, 20); + quicklistDelRange(ql, -1, 128); + ql_verify(ql, 16, 499, 32, 19); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDeleteNegative100From500List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete negative 100 from 500 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 500; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + quicklistDelRange(ql, -100, 100); + ql_verify(ql, 13, 400, 32, 16); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistDelete10Count5From50List(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("delete -10 count 5 from 50 list"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + for (int i = 0; i < 50; i++) quicklistPushTail(ql, genstr("hello", i + 1), 32); + ql_verify(ql, 2, 50, 32, 18); + quicklistDelRange(ql, -10, 5); + ql_verify(ql, 2, 45, 32, 13); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int 
test_quicklistNumbersOnlyListRead(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers only list read"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "1111", 4); + quicklistPushTail(ql, "2222", 4); + quicklistPushTail(ql, "3333", 4); + quicklistPushTail(ql, "4444", 4); + ql_verify(ql, 1, 4, 4, 4); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != 1111) { + TEST_PRINT_INFO("Not 1111, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 1, &entry); + if (entry.longval != 2222) { + TEST_PRINT_INFO("Not 2222, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 2, &entry); + if (entry.longval != 3333) { + TEST_PRINT_INFO("Not 3333, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 3, &entry); + if (entry.longval != 4444) { + TEST_PRINT_INFO("Not 4444, %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, 4, &entry); + if (iter) { + TEST_PRINT_INFO("Index past elements: %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (entry.longval != 4444) { + TEST_PRINT_INFO("Not 4444 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -2, &entry); + if (entry.longval != 3333) { + TEST_PRINT_INFO("Not 3333 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -3, &entry); + if (entry.longval != 2222) { + TEST_PRINT_INFO("Not 2222 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = 
quicklistGetIteratorEntryAtIdx(ql, -4, &entry); + if (entry.longval != 1111) { + TEST_PRINT_INFO("Not 1111 (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -5, &entry); + if (iter) { + TEST_PRINT_INFO("Index past elements (reverse), %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNumbersLargerListRead(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers larger list read"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistSetFill(ql, 32); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 5000; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistEntry entry; + for (int i = 0; i < 5000; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[i]) { + TEST_PRINT_INFO("[%d] Not longval %lld but rather %lld", i, nums[i], entry.longval); + err++; + } + entry.longval = 0xdeadbeef; + ql_release_iterator(iter); + } + iter = quicklistGetIteratorEntryAtIdx(ql, 5000, &entry); + if (strncmp((char *)entry.value, "xxxxxxxxxxxxxxxxxxxx", 20)) { + TEST_PRINT_INFO("String val not match: %s", entry.value); + err++; + } + ql_verify(ql, 157, 5001, 32, 9); + ql_release_iterator(iter); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistNumbersLargerListReadB(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("numbers 
larger list read B"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + quicklist *ql = quicklistNew(-2, options[_i]); + quicklistPushTail(ql, "99", 2); + quicklistPushTail(ql, "98", 2); + quicklistPushTail(ql, "xxxxxxxxxxxxxxxxxxxx", 20); + quicklistPushTail(ql, "96", 2); + quicklistPushTail(ql, "95", 2); + quicklistReplaceAtIndex(ql, 1, "foo", 3); + quicklistReplaceAtIndex(ql, -1, "bar", 3); + quicklistRelease(ql); + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLremTestAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("lrem test at compress"); + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char *words[] = {"abc", "foo", "bar", "foobar", "foobared", "zap", "bar", "test", "foo"}; + char *result[] = {"abc", "foo", "foobar", "foobared", "zap", "test", "foo"}; + char *resultB[] = {"abc", "foo", "foobar", "foobared", "zap", "test"}; + for (int i = 0; i < 9; i++) quicklistPushTail(ql, words[i], strlen(words[i])); + + /* lrem 0 bar */ + quicklistIter *iter = quicklistGetIterator(ql, AL_START_HEAD); + quicklistEntry entry; + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"bar", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + ql_release_iterator(iter); + + /* check result of lrem 0 bar */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + int sz = entry.sz; + if (strncmp((char *)entry.value, result[i], entry.sz)) { + TEST_PRINT_INFO("No match at position %d, got %.*s instead of %s", i, sz, entry.value, result[i]); + err++; + } + i++; + } + ql_release_iterator(iter); + + 
quicklistPushTail(ql, "foo", 3); + + /* lrem -2 foo */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + int del = 2; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"foo", 3)) { + quicklistDelEntry(iter, &entry); + del--; + } + if (!del) break; + i++; + } + ql_release_iterator(iter); + + /* check result of lrem -2 foo */ + /* (we're ignoring the '2' part and still deleting all foo + * because + * we only have two foo) */ + iter = quicklistGetIterator(ql, AL_START_TAIL); + i = 0; + size_t resB = sizeof(resultB) / sizeof(*resultB); + while (quicklistNext(iter, &entry)) { + /* Result must be: abc, foo, foobar, foobared, zap, test, + * foo */ + int sz = entry.sz; + if (strncmp((char *)entry.value, resultB[resB - 1 - i], sz)) { + TEST_PRINT_INFO("No match at position %d, got %.*s instead of %s", i, sz, entry.value, + resultB[resB - 1 - i]); + err++; + } + i++; + } + + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIterateReverseDeleteAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterate reverse + delete at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + quicklistPushTail(ql, "abc", 3); + quicklistPushTail(ql, "def", 3); + quicklistPushTail(ql, "hij", 3); + quicklistPushTail(ql, "jkl", 3); + quicklistPushTail(ql, "oop", 3); + + quicklistEntry entry; + quicklistIter *iter = quicklistGetIterator(ql, AL_START_TAIL); + int i = 0; + while (quicklistNext(iter, &entry)) { + if (quicklistCompare(&entry, (unsigned char *)"hij", 3)) { + quicklistDelEntry(iter, &entry); + } + i++; + } + ql_release_iterator(iter); + + if (i != 5) { + TEST_PRINT_INFO("Didn't iterate 5 
times, iterated %d times.", i); + err++; + } + + /* Check results after deletion of "hij" */ + iter = quicklistGetIterator(ql, AL_START_HEAD); + i = 0; + char *vals[] = {"abc", "def", "jkl", "oop"}; + while (quicklistNext(iter, &entry)) { + if (!quicklistCompare(&entry, (unsigned char *)vals[i], 3)) { + TEST_PRINT_INFO("Value at %d didn't match %s\n", i, vals[i]); + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistIteratorAtIndexTestAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("iterator at index test at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 760; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + + quicklistEntry entry; + iter = quicklistGetIteratorAtIdx(ql, AL_START_HEAD, 437); /* reuse the function-level iter instead of shadowing it */ + int i = 437; + while (quicklistNext(iter, &entry)) { + if (entry.longval != nums[i]) { + TEST_PRINT_INFO("Expected %lld, but got %lld", nums[i], entry.longval); /* expected value first, then actual */ + err++; + } + i++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestAAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test A at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long 
long nums[5000]; + for (int i = 0; i < 32; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 1, 32, 32, 32); + /* ltrim 25 53 (keep [25,31] inclusive = 7 remaining) */ + quicklistDelRange(ql, 0, 25); + quicklistDelRange(ql, 0, 0); + quicklistEntry entry; + for (int i = 0; i < 7; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[25 + i]) { + TEST_PRINT_INFO("Deleted invalid range! Expected %lld but got " + "%lld", + nums[25 + i], entry.longval); /* expected value first, then actual */ + err++; + } + ql_release_iterator(iter); + } + if (fills[f] == 32) ql_verify(ql, 1, 7, 7, 7); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestBAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test B at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + /* Force-disable compression because our 33 sequential + * integers don't compress and the check always fails. 
*/ + quicklist *ql = quicklistNew(fills[f], QUICKLIST_NOCOMPRESS); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); + /* ltrim 5 16 (keep [5,16] inclusive = 12 remaining) */ + quicklistDelRange(ql, 0, 5); + quicklistDelRange(ql, -16, 16); + if (fills[f] == 32) ql_verify(ql, 1, 12, 12, 12); + quicklistEntry entry; + + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != 5) { + TEST_PRINT_INFO("A: longval not 5, but %lld", entry.longval); + err++; + } + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + if (entry.longval != 16) { + TEST_PRINT_INFO("B! got instead: %lld", entry.longval); + err++; + } + quicklistPushTail(ql, "bobobob", 7); + ql_release_iterator(iter); + + iter = quicklistGetIteratorEntryAtIdx(ql, -1, &entry); + int sz = entry.sz; + if (strncmp((char *)entry.value, "bobobob", 7)) { + TEST_PRINT_INFO("Tail doesn't match bobobob, it's %.*s instead", sz, entry.value); + err++; + } + ql_release_iterator(iter); + + for (int i = 0; i < 12; i++) { + iter = quicklistGetIteratorEntryAtIdx(ql, i, &entry); + if (entry.longval != nums[5 + i]) { + TEST_PRINT_INFO("Deleted invalid range! 
Expected %lld but got " + "%lld", + nums[5 + i], entry.longval); /* expected value first, then actual */ + err++; + } + + ql_release_iterator(iter); + } + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestCAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test C at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 1); + /* ltrim 3 3 (keep [3,3] inclusive = 1 remaining) */ + quicklistDelRange(ql, 0, 3); + quicklistDelRange(ql, -29, 4000); /* make sure not loop forever */ + if (fills[f] == 32) ql_verify(ql, 1, 1, 1, 1); + quicklistEntry entry; + iter = quicklistGetIteratorEntryAtIdx(ql, 0, &entry); + if (entry.longval != -5157318210846258173) { + err++; + } + ql_release_iterator(iter); + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistLtrimTestDAtCompress(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + quicklistIter *iter; + TEST("ltrim test D at compress"); + + for (int _i = 0; _i < option_count; _i++) { + long long start = mstime(); + for (int f = 0; f < fill_count; f++) { + quicklist *ql = quicklistNew(fills[f], options[_i]); + char num[32]; + long long nums[5000]; + for (int i = 0; i < 33; i++) { + nums[i] = -5157318210846258176 + i; + int sz = ll2string(num, sizeof(num), nums[i]); + quicklistPushTail(ql, num, sz); + } + if (fills[f] == 32) ql_verify(ql, 2, 33, 32, 
1); + quicklistDelRange(ql, -12, 3); + if (ql->count != 30) { + TEST_PRINT_INFO("Didn't delete exactly three elements! Count is: %lu", ql->count); + err++; + } + quicklistRelease(ql); + } + + long long stop = mstime(); + runtime[_i] += stop - start; + } + UNUSED(iter); + TEST_ASSERT(err == 0); + return 0; +} + +int test_quicklistVerifySpecificCompressionOfInteriorNodes(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + int accurate = flags & UNIT_TEST_ACCURATE; + + TEST("verify specific compression of interior nodes"); + + /* Run a longer test of compression depth outside of primary test loop. */ + int list_sizes[] = {250, 251, 500, 999, 1000}; + int list_count = accurate ? (int)(sizeof(list_sizes) / sizeof(*list_sizes)) : 1; + for (int list = 0; list < list_count; list++) { + for (int f = 0; f < fill_count; f++) { + for (int depth = 1; depth < 40; depth++) { + /* skip over many redundant test cases */ + quicklist *ql = quicklistNew(fills[f], depth); + for (int i = 0; i < list_sizes[list]; i++) { + quicklistPushTail(ql, genstr("hello TAIL", i + 1), 64); + quicklistPushHead(ql, genstr("hello HEAD", i + 1), 64); + } + + for (int step = 0; step < 2; step++) { + /* test remove node */ + if (step == 1) { + for (int i = 0; i < list_sizes[list] / 2; i++) { + unsigned char *data; + TEST_ASSERT(quicklistPop(ql, QUICKLIST_HEAD, &data, + NULL, NULL)); + zfree(data); + TEST_ASSERT(quicklistPop(ql, QUICKLIST_TAIL, &data, + NULL, NULL)); + zfree(data); + } + } + quicklistNode *node = ql->head; + unsigned int low_raw = ql->compress; + unsigned int high_raw = ql->len - ql->compress; + + for (unsigned int at = 0; at < ql->len; + at++, node = node->next) { + if (at < low_raw || at >= high_raw) { + if (node->encoding != QUICKLIST_NODE_ENCODING_RAW) { + TEST_PRINT_INFO("Incorrect compression: node %d is " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu)", + at, depth, low_raw, high_raw, ql->len, + node->sz); + err++; + } + } 
else { + if (node->encoding != QUICKLIST_NODE_ENCODING_LZF) { + TEST_PRINT_INFO("Incorrect non-compression: node %d is NOT " + "compressed at depth %d ((%u, %u); total " + "nodes: %lu; size: %zu; attempted: %d)", + at, depth, low_raw, high_raw, ql->len, + node->sz, node->attempted_compress); + err++; + } + } + } + } + + quicklistRelease(ql); + } + } + } + TEST_ASSERT(err == 0); + return 0; +} + +/*----------------------------------------------------------------------------- + * Quicklist Bookmark Unit Test + *----------------------------------------------------------------------------*/ + +int test_quicklistBookmarkGetUpdatedToNextItem(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("bookmark get updated to next item"); + + quicklist *ql = quicklistNew(1, 0); + quicklistPushTail(ql, "1", 1); + quicklistPushTail(ql, "2", 1); + quicklistPushTail(ql, "3", 1); + quicklistPushTail(ql, "4", 1); + quicklistPushTail(ql, "5", 1); + TEST_ASSERT(ql->len == 5); + /* add two bookmarks, one pointing to the node before the last. */ + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next)); + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev)); + /* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == ql->tail->prev); + TEST_ASSERT(quicklistDelRange(ql, -2, 1)); + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == ql->tail); + /* delete the last node, and see that the bookmark was deleted. 
*/ + TEST_ASSERT(quicklistDelRange(ql, -1, 1)); + TEST_ASSERT(quicklistBookmarkFind(ql, "_test") == NULL); + /* test that other bookmarks aren't affected */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_dummy") == ql->head->next); + TEST_ASSERT(quicklistBookmarkFind(ql, "_missing") == NULL); + TEST_ASSERT(ql->len == 3); + quicklistBookmarksClear(ql); /* for coverage */ + TEST_ASSERT(quicklistBookmarkFind(ql, "_dummy") == NULL); + quicklistRelease(ql); + return 0; +} + +int test_quicklistBookmarkLimit(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + TEST("bookmark limit"); + + int i; + quicklist *ql = quicklistNew(1, 0); + quicklistPushHead(ql, "1", 1); + for (i = 0; i < QL_MAX_BM; i++) + TEST_ASSERT(quicklistBookmarkCreate(&ql, genstr("", i), ql->head)); + /* when all bookmarks are used, creation fails */ + TEST_ASSERT(!quicklistBookmarkCreate(&ql, "_test", ql->head)); + /* delete one and see that we can now create another */ + TEST_ASSERT(quicklistBookmarkDelete(ql, "0")); + TEST_ASSERT(quicklistBookmarkCreate(&ql, "_test", ql->head)); + /* delete one and see that the rest survive */ + TEST_ASSERT(quicklistBookmarkDelete(ql, "_test")); + for (i = 1; i < QL_MAX_BM; i++) + TEST_ASSERT(quicklistBookmarkFind(ql, genstr("", i)) == ql->head); + /* make sure the deleted ones are indeed gone */ + TEST_ASSERT(!quicklistBookmarkFind(ql, "0")); + TEST_ASSERT(!quicklistBookmarkFind(ql, "_test")); + quicklistRelease(ql); + return 0; +} + +int test_quicklistCompressAndDecompressQuicklistListpackNode(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST("compress and decompress quicklist listpack node"); + + if (!(flags & UNIT_TEST_LARGE_MEMORY)) return 0; + + quicklistNode *node = quicklistCreateNode(); + node->entry = lpNew(0); + + /* Just to avoid triggering the assertion in __quicklistCompressNode(), + * it disables the passing of quicklist head or tail node. 
*/ + node->prev = quicklistCreateNode(); + node->next = quicklistCreateNode(); + + /* Create a rand string */ + size_t sz = (1 << 25); /* 32MB per one entry */ + unsigned char *s = zmalloc(sz); + randstring(s, sz); + + /* Keep filling the node, until it reaches 1GB */ + for (int i = 0; i < 32; i++) { + node->entry = lpAppend(node->entry, s, sz); + node->sz = lpBytes((node)->entry); + + long long start = mstime(); + TEST_ASSERT(__quicklistCompressNode(node)); + TEST_ASSERT(__quicklistDecompressNode(node)); + TEST_PRINT_INFO("Compress and decompress: %zu MB in %.2f seconds.\n", + node->sz / 1024 / 1024, (float)(mstime() - start) / 1000); + } + + zfree(s); + zfree(node->prev); + zfree(node->next); + zfree(node->entry); + zfree(node); + return 0; +} + +int test_quicklistCompressAndDecomressQuicklistPlainNodeLargeThanUINT32MAX(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + TEST("compress and decomress quicklist plain node large than UINT32_MAX"); + + if (!(flags & UNIT_TEST_LARGE_MEMORY)) return 0; + +#if ULONG_MAX >= 0xffffffffffffffff + + size_t sz = (1ull << 32); + unsigned char *s = zmalloc(sz); + randstring(s, sz); + memcpy(s, "helloworld", 10); + memcpy(s + sz - 10, "1234567890", 10); + + quicklistNode *node = __quicklistCreateNode(QUICKLIST_NODE_CONTAINER_PLAIN, s, sz); + + /* Just to avoid triggering the assertion in __quicklistCompressNode(), + * it disables the passing of quicklist head or tail node. 
*/ + node->prev = quicklistCreateNode(); + node->next = quicklistCreateNode(); + + long long start = mstime(); + TEST_ASSERT(__quicklistCompressNode(node)); + TEST_ASSERT(__quicklistDecompressNode(node)); + TEST_PRINT_INFO("Compress and decompress: %zu MB in %.2f seconds.\n", + node->sz / 1024 / 1024, (float)(mstime() - start) / 1000); + + TEST_ASSERT(memcmp(node->entry, "helloworld", 10) == 0); + TEST_ASSERT(memcmp(node->entry + sz - 10, "1234567890", 10) == 0); + zfree(node->prev); + zfree(node->next); + zfree(node->entry); + zfree(node); + +#endif + return 0; +} diff --git a/src/unit/test_sds.c b/src/unit/test_sds.c index 19b5c7d73f..30f25e4f6f 100644 --- a/src/unit/test_sds.c +++ b/src/unit/test_sds.c @@ -259,43 +259,44 @@ int test_typesAndAllocSize(int argc, char **argv, int flags) { sds x = sdsnewlen(NULL, 31); TEST_ASSERT_MESSAGE("len 31 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_5); + TEST_ASSERT_MESSAGE("len 31 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 32); TEST_ASSERT_MESSAGE("len 32 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 32 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 252); TEST_ASSERT_MESSAGE("len 252 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_8); - TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 252 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 253); TEST_ASSERT_MESSAGE("len 253 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 253 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65530); 
TEST_ASSERT_MESSAGE("len 65530 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_16); - TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65530 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 65531); TEST_ASSERT_MESSAGE("len 65531 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 65531 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); #if (LONG_MAX == LLONG_MAX) if (flags & UNIT_TEST_LARGE_MEMORY) { x = sdsnewlen(NULL, 4294967286); TEST_ASSERT_MESSAGE("len 4294967286 type", (x[-1] & SDS_TYPE_MASK) >= SDS_TYPE_32); - TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967286 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); x = sdsnewlen(NULL, 4294967287); TEST_ASSERT_MESSAGE("len 4294967287 type", (x[-1] & SDS_TYPE_MASK) == SDS_TYPE_64); - TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_size(sdsAllocPtr(x))); + TEST_ASSERT_MESSAGE("len 4294967287 sdsAllocSize", sdsAllocSize(x) == s_malloc_usable_size(sdsAllocPtr(x))); sdsfree(x); } #endif @@ -328,3 +329,44 @@ int test_sdsHeaderSizes(int argc, char **argv, int flags) { return 0; } + +int test_sdssplitargs(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int len; + sds *sargv; + + sargv = sdssplitargs("Testing one two three", &len); + TEST_ASSERT(4 == len); + TEST_ASSERT(!strcmp("Testing", sargv[0])); + TEST_ASSERT(!strcmp("one", sargv[1])); + TEST_ASSERT(!strcmp("two", sargv[2])); + TEST_ASSERT(!strcmp("three", sargv[3])); + sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("", &len); + TEST_ASSERT(0 == len); + TEST_ASSERT(sargv != NULL); + 
sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("\"Testing split strings\" \'Another split string\'", &len); + TEST_ASSERT(2 == len); + TEST_ASSERT(!strcmp("Testing split strings", sargv[0])); + TEST_ASSERT(!strcmp("Another split string", sargv[1])); + sdsfreesplitres(sargv, len); + + sargv = sdssplitargs("\"Hello\" ", &len); + TEST_ASSERT(1 == len); + TEST_ASSERT(!strcmp("Hello", sargv[0])); + sdsfreesplitres(sargv, len); + + char *binary_string = "\"\\x73\\x75\\x70\\x65\\x72\\x20\\x00\\x73\\x65\\x63\\x72\\x65\\x74\\x20\\x70\\x61\\x73\\x73\\x77\\x6f\\x72\\x64\""; + sargv = sdssplitargs(binary_string, &len); + TEST_ASSERT(1 == len); + TEST_ASSERT(22 == sdslen(sargv[0])); + sdsfreesplitres(sargv, len); + + return 0; +} diff --git a/src/unit/test_util.c b/src/unit/test_util.c index 70be0255d8..4558c38c3b 100644 --- a/src/unit/test_util.c +++ b/src/unit/test_util.c @@ -286,7 +286,9 @@ static int cache_exist(int fd) { int test_reclaimFilePageCache(int argc, char **argv, int flags) { UNUSED(argc); UNUSED(argv); - UNUSED(flags); + + /* The test is incompatible with valgrind, skip it. */ + if (flags & UNIT_TEST_VALGRIND) return 0; #if defined(__linux__) char *tmpfile = "/tmp/redis-reclaim-cache-test"; diff --git a/src/unit/test_valkey_strtod.c b/src/unit/test_valkey_strtod.c new file mode 100644 index 0000000000..4796d7a5b6 --- /dev/null +++ b/src/unit/test_valkey_strtod.c @@ -0,0 +1,36 @@ +/* + * Copyright Valkey Contributors. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD 3-Clause + */ + + +#include "../valkey_strtod.h" +#include "errno.h" +#include "math.h" +#include "test_help.h" + +int test_valkey_strtod(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + errno = 0; + double value = valkey_strtod("231.2341234", NULL); + TEST_ASSERT(value == 231.2341234); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("+inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("-inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + value = valkey_strtod("inf", NULL); + TEST_ASSERT(isinf(value)); + TEST_ASSERT(errno == 0); + + return 0; +} diff --git a/src/unit/test_zmalloc.c b/src/unit/test_zmalloc.c index 6c1d03e8e1..08444a157e 100644 --- a/src/unit/test_zmalloc.c +++ b/src/unit/test_zmalloc.c @@ -6,6 +6,8 @@ int test_zmallocInitialUsedMemory(int argc, char **argv, int flags) { UNUSED(argv); UNUSED(flags); + /* If this fails, it may be that other tests have failed and the memory has not been released. 
*/ + TEST_PRINT_INFO("test_zmallocInitialUsedMemory; used: %zu\n", zmalloc_used_memory()); TEST_ASSERT(zmalloc_used_memory() == 0); return 0; diff --git a/src/unix.c b/src/unix.c index 35778779f9..86df05bd52 100644 --- a/src/unix.c +++ b/src/unix.c @@ -74,6 +74,10 @@ static int connUnixListen(connListener *listener) { return C_OK; } +static void connUnixCloseListener(connListener *listener) { + connectionTypeTcp()->closeListener(listener); +} + static connection *connCreateUnix(void) { connection *conn = zcalloc(sizeof(connection)); conn->type = &CT_Unix; @@ -174,6 +178,7 @@ static ConnectionType CT_Unix = { .addr = connUnixAddr, .is_local = connUnixIsLocal, .listen = connUnixListen, + .closeListener = connUnixCloseListener, /* create/shutdown/close connection */ .conn_create = connCreateUnix, diff --git a/src/util.c b/src/util.c index b1235c2822..0b7af2d3fa 100644 --- a/src/util.c +++ b/src/util.c @@ -51,6 +51,8 @@ #include "sha256.h" #include "config.h" +#include "valkey_strtod.h" + #define UNUSED(x) ((void)(x)) /* Glob-style pattern matching. 
*/ @@ -595,10 +597,12 @@ int string2ld(const char *s, size_t slen, long double *dp) { int string2d(const char *s, size_t slen, double *dp) { errno = 0; char *eptr; - *dp = strtod(s, &eptr); + *dp = valkey_strtod(s, &eptr); if (slen == 0 || isspace(((const char *)s)[0]) || (size_t)(eptr - (char *)s) != slen || - (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp)) + (errno == ERANGE && (*dp == HUGE_VAL || *dp == -HUGE_VAL || fpclassify(*dp) == FP_ZERO)) || isnan(*dp) || errno == EINVAL) { + errno = 0; return 0; + } return 1; } diff --git a/src/valkey-benchmark.c b/src/valkey-benchmark.c index b22ee8cbed..57cdd6fc16 100644 --- a/src/valkey-benchmark.c +++ b/src/valkey-benchmark.c @@ -199,7 +199,7 @@ static long long showThroughput(struct aeEventLoop *eventLoop, long long id, voi /* Dict callbacks */ static uint64_t dictSdsHash(const void *key); -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2); +static int dictSdsKeyCompare(const void *key1, const void *key2); /* Implementation */ static long long ustime(void) { @@ -220,10 +220,8 @@ static uint64_t dictSdsHash(const void *key) { return dictGenHashFunction((unsigned char *)key, sdslen((char *)key)); } -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +static int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; diff --git a/src/valkey-cli.c b/src/valkey-cli.c index b4a7fcaf91..4416e09431 100644 --- a/src/valkey-cli.c +++ b/src/valkey-cli.c @@ -65,6 +65,8 @@ #include "mt19937-64.h" #include "cli_commands.h" +#include "valkey_strtod.h" + #define UNUSED(V) ((void)V) #define OUTPUT_STANDARD 0 @@ -172,9 +174,9 @@ static struct termios orig_termios; /* To restore terminal at exit.*/ /* Dict Helpers */ static uint64_t dictSdsHash(const void *key); -static int dictSdsKeyCompare(dict *d, const void *key1, const void 
*key2); -static void dictSdsDestructor(dict *d, void *val); -static void dictListDestructor(dict *d, void *val); +static int dictSdsKeyCompare(const void *key1, const void *key2); +static void dictSdsDestructor(void *val); +static void dictListDestructor(void *val); /* Cluster Manager Command Info */ typedef struct clusterManagerCommand { @@ -371,23 +373,19 @@ static uint64_t dictSdsHash(const void *key) { return dictGenHashFunction((unsigned char *)key, sdslen((char *)key)); } -static int dictSdsKeyCompare(dict *d, const void *key1, const void *key2) { +static int dictSdsKeyCompare(const void *key1, const void *key2) { int l1, l2; - UNUSED(d); - l1 = sdslen((sds)key1); l2 = sdslen((sds)key2); if (l1 != l2) return 0; return memcmp(key1, key2, l1) == 0; } -static void dictSdsDestructor(dict *d, void *val) { - UNUSED(d); +static void dictSdsDestructor(void *val) { sdsfree(val); } -void dictListDestructor(dict *d, void *val) { - UNUSED(d); +void dictListDestructor(void *val) { listRelease((list *)val); } @@ -2541,9 +2539,10 @@ static int parseOptions(int argc, char **argv) { exit(1); } } else if (!strcmp(argv[i], "-t") && !lastarg) { + errno = 0; char *eptr; - double seconds = strtod(argv[++i], &eptr); - if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0) { + double seconds = valkey_strtod(argv[++i], &eptr); + if (eptr[0] != '\0' || isnan(seconds) || seconds < 0.0 || errno == EINVAL || errno == ERANGE) { fprintf(stderr, "Invalid connection timeout for -t.\n"); exit(1); } @@ -4395,7 +4394,7 @@ static sds clusterManagerNodeInfo(clusterManagerNode *node, int indent) { if (node->replicate != NULL) info = sdscatfmt(info, "\n%s replicates %S", spaces, node->replicate); else if (node->replicas_count) - info = sdscatfmt(info, "\n%s %U additional replica(s)", spaces, node->replicas_count); + info = sdscatfmt(info, "\n%s %i additional replica(s)", spaces, node->replicas_count); sdsfree(spaces); return info; } @@ -8663,9 +8662,8 @@ static typeinfo *typeinfo_add(dict *types, 
char *name, typeinfo *type_template) return info; } -void type_free(dict *d, void *val) { +void type_free(void *val) { typeinfo *info = val; - UNUSED(d); if (info->biggest_key) sdsfree(info->biggest_key); sdsfree(info->name); zfree(info); diff --git a/src/valkey_strtod.h b/src/valkey_strtod.h new file mode 100644 index 0000000000..037a3f3cec --- /dev/null +++ b/src/valkey_strtod.h @@ -0,0 +1,42 @@ +#ifndef FAST_FLOAT_STRTOD_H +#define FAST_FLOAT_STRTOD_H + +#ifdef USE_FAST_FLOAT + +#include "errno.h" + +/** + * Converts a null-terminated byte string to a double using the fast_float library. + * + * This function provides a C-compatible wrapper around the fast_float library's string-to-double + * conversion functionality. It aims to offer a faster alternative to the standard strtod function. + * + * str: A pointer to the null-terminated byte string to be converted. + * eptr: On success, stores char pointer pointing to '\0' at the end of the string. + * On failure, stores char pointer pointing to first invalid character in the string. + * returns: On success, the function returns the converted double value. + * On failure, it returns 0.0 and stores error code in errno to ERANGE or EINVAL. + * + * note: This function uses the fast_float library (https://github.com/fastfloat/fast_float) for + * the actual conversion, which can be significantly faster than standard library functions. + * Refer to "../deps/fast_float_c_interface" for more details. + * Refer to https://github.com/fastfloat/fast_float for more information on the underlying library. 
+ */ +double fast_float_strtod(const char *str, char **endptr); + +static inline double valkey_strtod(const char *str, char **endptr) { + errno = 0; + return fast_float_strtod(str, endptr); +} + +#else + +#include + +static inline double valkey_strtod(const char *str, char **endptr) { + return strtod(str, endptr); +} + +#endif + +#endif // FAST_FLOAT_STRTOD_H diff --git a/src/valkeymodule.h b/src/valkeymodule.h index c2cdb2f0e7..7c3adfd477 100644 --- a/src/valkeymodule.h +++ b/src/valkeymodule.h @@ -967,6 +967,7 @@ VALKEYMODULE_API void (*ValkeyModule_SetModuleAttribs)(ValkeyModuleCtx *ctx, con VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_IsModuleNameBusy)(const char *name) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_WrongArity)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; +VALKEYMODULE_API int (*ValkeyModule_UpdateRuntimeArgs)(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_ReplyWithLongLong)(ValkeyModuleCtx *ctx, long long ll) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_GetSelectedDb)(ValkeyModuleCtx *ctx) VALKEYMODULE_ATTR; VALKEYMODULE_API int (*ValkeyModule_SelectDb)(ValkeyModuleCtx *ctx, int newid) VALKEYMODULE_ATTR; @@ -1673,6 +1674,7 @@ static int ValkeyModule_Init(ValkeyModuleCtx *ctx, const char *name, int ver, in VALKEYMODULE_GET_API(SetModuleAttribs); VALKEYMODULE_GET_API(IsModuleNameBusy); VALKEYMODULE_GET_API(WrongArity); + VALKEYMODULE_GET_API(UpdateRuntimeArgs); VALKEYMODULE_GET_API(ReplyWithLongLong); VALKEYMODULE_GET_API(ReplyWithError); VALKEYMODULE_GET_API(ReplyWithErrorFormat); diff --git a/src/zmalloc.c b/src/zmalloc.c index e18fa8bac2..a696111e47 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -84,8 +84,6 @@ void zlibc_free(void *ptr) { #define calloc(count, size) je_calloc(count, size) #define realloc(ptr, size) je_realloc(ptr, size) #define free(ptr) je_free(ptr) -#define mallocx(size, flags) je_mallocx(size, flags) -#define dallocx(ptr, flags) 
je_dallocx(ptr, flags) #endif #define thread_local _Thread_local @@ -207,25 +205,6 @@ void *zmalloc_usable(size_t size, size_t *usable) { return ptr; } -/* Allocation and free functions that bypass the thread cache - * and go straight to the allocator arena bins. - * Currently implemented only for jemalloc. Used for online defragmentation. */ -#ifdef HAVE_DEFRAG -void *zmalloc_no_tcache(size_t size) { - if (size >= SIZE_MAX / 2) zmalloc_oom_handler(size); - void *ptr = mallocx(size + PREFIX_SIZE, MALLOCX_TCACHE_NONE); - if (!ptr) zmalloc_oom_handler(size); - update_zmalloc_stat_alloc(zmalloc_size(ptr)); - return ptr; -} - -void zfree_no_tcache(void *ptr) { - if (ptr == NULL) return; - update_zmalloc_stat_free(zmalloc_size(ptr)); - dallocx(ptr, MALLOCX_TCACHE_NONE); -} -#endif - /* Try allocating memory and zero it, and return NULL if failed. * '*usable' is set to the usable size if non NULL. */ static inline void *ztrycalloc_usable_internal(size_t size, size_t *usable) { @@ -683,52 +662,7 @@ size_t zmalloc_get_rss(void) { #define STRINGIFY_(x) #x #define STRINGIFY(x) STRINGIFY_(x) -/* Compute the total memory wasted in fragmentation of inside small arena bins. - * Done by summing the memory in unused regs in all slabs of all small bins. */ -size_t zmalloc_get_frag_smallbins(void) { - unsigned nbins; - size_t sz, frag = 0; - char buf[100]; - - sz = sizeof(unsigned); - assert(!je_mallctl("arenas.nbins", &nbins, &sz, NULL, 0)); - for (unsigned j = 0; j < nbins; j++) { - size_t curregs, curslabs, reg_size; - uint32_t nregs; - - /* The size of the current bin */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.size", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, ®_size, &sz, NULL, 0)); - - /* Number of used regions in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." 
STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curregs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curregs, &sz, NULL, 0)); - - /* Number of regions per slab */ - snprintf(buf, sizeof(buf), "arenas.bin.%d.nregs", j); - sz = sizeof(uint32_t); - assert(!je_mallctl(buf, &nregs, &sz, NULL, 0)); - - /* Number of current slabs in the bin */ - snprintf(buf, sizeof(buf), "stats.arenas." STRINGIFY(MALLCTL_ARENAS_ALL) ".bins.%d.curslabs", j); - sz = sizeof(size_t); - assert(!je_mallctl(buf, &curslabs, &sz, NULL, 0)); - - /* Calculate the fragmentation bytes for the current bin and add it to the total. */ - frag += ((nregs * curslabs) - curregs) * reg_size; - } - - return frag; -} - -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { uint64_t epoch = 1; size_t sz; *allocated = *resident = *active = 0; @@ -763,8 +697,6 @@ int zmalloc_get_allocator_info(size_t *allocated, *muzzy = pmuzzy * page; } - /* Total size of consumed meomry in unused regs in small bins (AKA external fragmentation). 
*/ - *frag_smallbins_bytes = zmalloc_get_frag_smallbins(); return 1; } @@ -789,13 +721,8 @@ int jemalloc_purge(void) { #else -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes) { - *allocated = *resident = *active = *frag_smallbins_bytes = 0; +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy) { + *allocated = *resident = *active = 0; if (retained) *retained = 0; if (muzzy) *muzzy = 0; return 1; diff --git a/src/zmalloc.h b/src/zmalloc.h index 9b51f4c866..38c2bae864 100644 --- a/src/zmalloc.h +++ b/src/zmalloc.h @@ -100,13 +100,6 @@ #include #endif -/* We can enable the server defrag capabilities only if we are using Jemalloc - * and the version used is our special version modified for the server having - * the ability to return per-allocation fragmentation hints. */ -#if defined(USE_JEMALLOC) && defined(JEMALLOC_FRAG_HINT) -#define HAVE_DEFRAG -#endif - /* The zcalloc symbol is a symbol name already used by zlib, which is defining * other names using the "z" prefix specific to zlib. 
In practice, linking * valkey with a static openssl, which itself might depend on a static libz @@ -138,12 +131,7 @@ __attribute__((malloc)) char *zstrdup(const char *s); size_t zmalloc_used_memory(void); void zmalloc_set_oom_handler(void (*oom_handler)(size_t)); size_t zmalloc_get_rss(void); -int zmalloc_get_allocator_info(size_t *allocated, - size_t *active, - size_t *resident, - size_t *retained, - size_t *muzzy, - size_t *frag_smallbins_bytes); +int zmalloc_get_allocator_info(size_t *allocated, size_t *active, size_t *resident, size_t *retained, size_t *muzzy); void set_jemalloc_bg_thread(int enable); int jemalloc_purge(void); size_t zmalloc_get_private_dirty(long pid); @@ -153,11 +141,6 @@ void zlibc_free(void *ptr); void zlibc_trim(void); void zmadvise_dontneed(void *ptr); -#ifdef HAVE_DEFRAG -void zfree_no_tcache(void *ptr); -__attribute__((malloc)) void *zmalloc_no_tcache(size_t size); -#endif - #ifndef HAVE_MALLOC_SIZE size_t zmalloc_size(void *ptr); size_t zmalloc_usable_size(void *ptr); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 0000000000..2a76897bb0 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,5 @@ +add_subdirectory(rdma) + +if (BUILD_TEST_MODULES) + add_subdirectory(modules) +endif () diff --git a/tests/cluster/tests/28-cluster-shards.tcl b/tests/cluster/tests/28-cluster-shards.tcl index d6534c816b..5fb6743246 100644 --- a/tests/cluster/tests/28-cluster-shards.tcl +++ b/tests/cluster/tests/28-cluster-shards.tcl @@ -117,7 +117,7 @@ test "Kill a node and tell the replica to immediately takeover" { # Primary 0 node should report as fail, wait until the new primary acknowledges it. 
test "Verify health as fail for killed node" { - wait_for_condition 50 100 { + wait_for_condition 1000 50 { "fail" eq [dict get [get_node_info_from_shard $node_0_id 4 "node"] "health"] } else { fail "New primary never detected the node failed" diff --git a/tests/integration/dual-channel-replication.tcl b/tests/integration/dual-channel-replication.tcl index 922676fb19..bb767d784c 100644 --- a/tests/integration/dual-channel-replication.tcl +++ b/tests/integration/dual-channel-replication.tcl @@ -23,14 +23,20 @@ proc get_client_id_by_last_cmd {r cmd} { return $client_id } -# Wait until the process enters a paused state, then resume the process. -proc wait_and_resume_process idx { +# Wait until the process enters a paused state. +proc wait_process_paused idx { set pid [srv $idx pid] wait_for_condition 50 1000 { [string match "T*" [exec ps -o state= -p $pid]] } else { fail "Process $pid didn't stop, current state is [exec ps -o state= -p $pid]" } +} + +# Wait until the process enters a paused state, then resume the process. +proc wait_and_resume_process idx { + set pid [srv $idx pid] + wait_process_paused $idx resume_process $pid } @@ -479,7 +485,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { } wait_for_value_to_propegate_to_replica $primary $replica "key1" # Confirm the occurrence of a race condition. 
- wait_for_log_messages -1 {"*Dual channel sync - psync established after rdb load*"} 0 2000 1 + wait_for_log_messages -1 {"* Psync established after rdb load*"} 0 2000 1 } } } @@ -769,7 +775,7 @@ start_server {tags {"dual-channel-replication external:skip"}} { $replica config set dual-channel-replication-enabled yes $replica config set loglevel debug - $replica config set repl-timeout 10 + $replica config set repl-timeout 60 $primary config set repl-backlog-size 1mb test "Test dual-channel-replication primary gets cob overrun before established psync" { @@ -790,16 +796,56 @@ start_server {tags {"dual-channel-replication external:skip"}} { } else { fail "Primary did not free repl buf block after sync failure" } + # Full sync will be triggered after the replica is reconnected, pause primary main process after fork. + # In this way, in the subsequent replicaof no one, we won't get the LOADING error if the replica reconnects + # too quickly and enters the loading state. + $primary debug pause-after-fork 1 resume_process $replica_pid set res [wait_for_log_messages -1 {"*Unable to partial resync with replica * for lack of backlog*"} $loglines 2000 10] set loglines [lindex $res 1] } + # Waiting for the primary to enter the paused state, that is, make sure that bgsave is triggered. + wait_process_paused -1 $replica replicaof no one + # Resume the primary and make sure the sync is dropped. 
+ resume_process [srv -1 pid] + $primary debug pause-after-fork 0 wait_for_condition 500 1000 { [s -1 rdb_bgsave_in_progress] eq 0 } else { fail "Primary should abort sync" } + stop_write_load $load_handle0 + stop_write_load $load_handle1 + stop_write_load $load_handle2 + } +} + +start_server {tags {"dual-channel-replication external:skip"}} { + set primary [srv 0 client] + set primary_host [srv 0 host] + set primary_port [srv 0 port] + set loglines [count_log_lines 0] + + $primary config set repl-diskless-sync yes + $primary config set dual-channel-replication-enabled yes + $primary config set client-output-buffer-limit "replica 1100k 0 0" + $primary config set loglevel debug + start_server {} { + set replica [srv 0 client] + set replica_host [srv 0 host] + set replica_port [srv 0 port] + set replica_log [srv 0 stdout] + set replica_pid [srv 0 pid] + + set load_handle0 [start_write_load $primary_host $primary_port 60] + set load_handle1 [start_write_load $primary_host $primary_port 60] + set load_handle2 [start_write_load $primary_host $primary_port 60] + + $replica config set dual-channel-replication-enabled yes + $replica config set loglevel debug + $replica config set repl-timeout 60 + $primary config set repl-backlog-size 1mb $replica debug pause-after-fork 1 $primary debug populate 1000 primary 100000 diff --git a/tests/modules/CMakeLists.txt b/tests/modules/CMakeLists.txt new file mode 100644 index 0000000000..0cac0c4cb6 --- /dev/null +++ b/tests/modules/CMakeLists.txt @@ -0,0 +1,58 @@ +# Build test modules +list(APPEND MODULES_LIST "commandfilter") +list(APPEND MODULES_LIST "basics") +list(APPEND MODULES_LIST "testrdb") +list(APPEND MODULES_LIST "fork") +list(APPEND MODULES_LIST "infotest") +list(APPEND MODULES_LIST "propagate") +list(APPEND MODULES_LIST "misc") +list(APPEND MODULES_LIST "hooks") +list(APPEND MODULES_LIST "blockonkeys") +list(APPEND MODULES_LIST "blockonbackground") +list(APPEND MODULES_LIST "scan") +list(APPEND MODULES_LIST "datatype") 
+list(APPEND MODULES_LIST "datatype2") +list(APPEND MODULES_LIST "auth") +list(APPEND MODULES_LIST "keyspace_events") +list(APPEND MODULES_LIST "blockedclient") +list(APPEND MODULES_LIST "getkeys") +list(APPEND MODULES_LIST "getchannels") +list(APPEND MODULES_LIST "test_lazyfree") +list(APPEND MODULES_LIST "timer") +list(APPEND MODULES_LIST "defragtest") +list(APPEND MODULES_LIST "keyspecs") +list(APPEND MODULES_LIST "hash") +list(APPEND MODULES_LIST "zset") +list(APPEND MODULES_LIST "stream") +list(APPEND MODULES_LIST "mallocsize") +list(APPEND MODULES_LIST "aclcheck") +list(APPEND MODULES_LIST "list") +list(APPEND MODULES_LIST "subcommands") +list(APPEND MODULES_LIST "reply") +list(APPEND MODULES_LIST "cmdintrospection") +list(APPEND MODULES_LIST "eventloop") +list(APPEND MODULES_LIST "moduleconfigs") +list(APPEND MODULES_LIST "moduleconfigstwo") +list(APPEND MODULES_LIST "publish") +list(APPEND MODULES_LIST "usercall") +list(APPEND MODULES_LIST "postnotifications") +list(APPEND MODULES_LIST "moduleauthtwo") +list(APPEND MODULES_LIST "rdbloadsave") +list(APPEND MODULES_LIST "crash") +list(APPEND MODULES_LIST "cluster") + +foreach (MODULE_NAME ${MODULES_LIST}) + message(STATUS "Building test module: ${MODULE_NAME}") + add_library(${MODULE_NAME} SHARED "${CMAKE_SOURCE_DIR}/tests/modules/${MODULE_NAME}.c") + target_include_directories(${MODULE_NAME} PRIVATE "${CMAKE_SOURCE_DIR}/src") + if (LINUX AND NOT APPLE) + # set the std to gnu11 here, to allow crash.c to get compiled + target_compile_options(${MODULE_NAME} PRIVATE "-std=gnu11") + endif () + set_target_properties(${MODULE_NAME} PROPERTIES PREFIX "") + valkey_install_bin(${MODULE_NAME}) + if (APPLE) + # Some symbols can only be resolved during runtime (they exist in the executable) + target_link_options(${MODULE_NAME} PRIVATE -undefined dynamic_lookup) + endif () +endforeach () diff --git a/tests/modules/Makefile b/tests/modules/Makefile index 1690b9b627..82813bb6f7 100644 --- a/tests/modules/Makefile +++ 
b/tests/modules/Makefile @@ -58,6 +58,7 @@ TEST_MODULES = \ eventloop.so \ moduleconfigs.so \ moduleconfigstwo.so \ + moduleparameter.so \ publish.so \ usercall.so \ postnotifications.so \ diff --git a/tests/modules/moduleparameter.c b/tests/modules/moduleparameter.c new file mode 100644 index 0000000000..6c110f2cfb --- /dev/null +++ b/tests/modules/moduleparameter.c @@ -0,0 +1,28 @@ +#include "valkeymodule.h" +#include +#include +#include +#include + +int test_module_update_parameter(ValkeyModuleCtx *ctx, + ValkeyModuleString **argv, int argc) { + + ValkeyModule_UpdateRuntimeArgs(ctx, argv, argc); + return ValkeyModule_ReplyWithSimpleString(ctx, "OK"); +} + +int ValkeyModule_OnLoad(ValkeyModuleCtx *ctx, ValkeyModuleString **argv, int argc) { + VALKEYMODULE_NOT_USED(argv); + VALKEYMODULE_NOT_USED(argc); + + if (ValkeyModule_Init(ctx, "moduleparameter", 1, VALKEYMODULE_APIVER_1) == + VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + if (ValkeyModule_CreateCommand(ctx, "testmoduleparameter.update.parameter", + test_module_update_parameter, "fast", 0, 0, + 0) == VALKEYMODULE_ERR) + return VALKEYMODULE_ERR; + + return VALKEYMODULE_OK; +} diff --git a/tests/rdma/CMakeLists.txt b/tests/rdma/CMakeLists.txt new file mode 100644 index 0000000000..f721e9af52 --- /dev/null +++ b/tests/rdma/CMakeLists.txt @@ -0,0 +1,9 @@ +project(rdma-test) + +# Make sure RDMA build is enabled +if (BUILD_RDMA_MODULE) + add_executable(rdma-test "${CMAKE_SOURCE_DIR}/tests/rdma/rdma-test.c") + target_link_libraries(rdma-test "${RDMA_LIBS}") + target_link_options(rdma-test PRIVATE "-pthread") + valkey_install_bin(rdma-test) +endif () diff --git a/tests/rdma/run.py b/tests/rdma/run.py index 0724c27adc..77e0f285fe 100755 --- a/tests/rdma/run.py +++ b/tests/rdma/run.py @@ -60,10 +60,9 @@ def test_rdma(ipaddr): # step 2, start server svrpath = valkeydir + "/src/valkey-server" - rdmapath = valkeydir + "/src/valkey-rdma.so" svrcmd = [svrpath, "--port", "0", "--loglevel", "verbose", "--protected-mode", 
"yes", "--appendonly", "no", "--daemonize", "no", "--dir", valkeydir + "/tests/rdma/tmp", - "--loadmodule", rdmapath, "port=6379", "bind=" + ipaddr] + "--rdma-port", "6379", "--rdma-bind", ipaddr] svr = subprocess.Popen(svrcmd, shell=False, stdout=subprocess.PIPE) try: diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index dd5cd84df2..686f00071b 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -145,6 +145,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue wait_for_condition 1000 50 { [CI $j cluster_state] eq $state } else { @@ -277,6 +278,14 @@ proc cluster_get_myself id { return {} } +# Returns the parsed "myself's primary" CLUSTER NODES entry as a dictionary. +proc cluster_get_myself_primary id { + set myself [cluster_get_myself $id] + set replicaof [dict get $myself slaveof] + set node [cluster_get_node_by_id $id $replicaof] + return $node +} + # Get a specific node by ID by parsing the CLUSTER NODES output # of the instance Number 'instance_id' proc cluster_get_node_by_id {instance_id node_id} { diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 7c15413806..1f0658071a 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -35,12 +35,12 @@ foreach test_dir $test_dirs { set cluster_test_dir unit/cluster foreach file [glob -nocomplain $dir/tests/$cluster_test_dir/*.tcl] { - lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]] + lappend ::cluster_all_tests $cluster_test_dir/[file root [file tail $file]] } set moduleapi_test_dir unit/moduleapi foreach file [glob -nocomplain $dir/tests/$moduleapi_test_dir/*.tcl] { - lappend ::module_api_all_tests $moduleapi_test_dir/[file root [file tail $file]] + lappend ::module_api_all_tests $moduleapi_test_dir/[file 
root [file tail $file]] } # Index to the next test to run in the ::all_tests list. @@ -654,7 +654,7 @@ for {set j 0} {$j < [llength $argv]} {incr j} { } } elseif {$opt eq {--quiet}} { set ::quiet 1 - } elseif {$opt eq {--io-threads}} { + } elseif {$opt eq {--io-threads}} { set ::io_threads 1 } elseif {$opt eq {--tls} || $opt eq {--tls-module}} { package require tls 1.6 diff --git a/tests/unit/cluster/cluster-shards.tcl b/tests/unit/cluster/cluster-shards.tcl index 19acd186f5..170114d822 100644 --- a/tests/unit/cluster/cluster-shards.tcl +++ b/tests/unit/cluster/cluster-shards.tcl @@ -42,7 +42,7 @@ start_cluster 3 3 {tags {external:skip cluster}} { } test "Verify health as fail for killed node" { - wait_for_condition 50 100 { + wait_for_condition 1000 50 { "fail" eq [dict get [get_node_info_from_shard $node_0_id $validation_node "node"] "health"] } else { fail "New primary never detected the node failed" diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 7bc6a05e95..9262049e4e 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -64,3 +64,41 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster + + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} { + test "Primaries will not time out then they are elected in the same epoch" { + # Since we have the delay time, so these node may not initiate the + # election at the same time (same epoch). But if they do, we make + # sure there is no failover timeout. + + # Killing there primary nodes. + pause_process [srv 0 pid] + pause_process [srv -1 pid] + pause_process [srv -2 pid] + + # Wait for the failover + wait_for_condition 1000 50 { + [s -7 role] == "master" && + [s -8 role] == "master" && + [s -9 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure there is no false epoch 0. 
+ verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0 + + # Make sure there is no failover timeout. + verify_no_log_message -7 "*Failover attempt expired*" 0 + verify_no_log_message -8 "*Failover attempt expired*" 0 + verify_no_log_message -9 "*Failover attempt expired*" 0 + + # Resuming these primary nodes, speed up the shutdown. + resume_process [srv 0 pid] + resume_process [srv -1 pid] + resume_process [srv -2 pid] + } +} ;# start_cluster diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl index 0d7b249899..f882378172 100644 --- a/tests/unit/cluster/info.tcl +++ b/tests/unit/cluster/info.tcl @@ -41,3 +41,26 @@ test "errorstats: rejected call due to MOVED Redirection" { } } ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} { + test "fail reason changed" { + # Kill one primary, so the cluster fail with not-full-coverage. + pause_process [srv 0 pid] + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + verify_log_message -1 "*At least one hash slot is not served by any available node*" 0 + verify_log_message -2 "*At least one hash slot is not served by any available node*" 0 + + # Kill one more primary, so the cluster fail with minority-partition. 
+ pause_process [srv -1 pid] + wait_for_log_messages -2 {"*minority partition*"} 0 1000 50 + + resume_process [srv 0 pid] + resume_process [srv -1 pid] + wait_for_cluster_state ok + } +} diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 2a9dff934b..220ffc3eaf 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -183,3 +183,194 @@ test "Wait for instance #0 to return back alive" { } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - drop the auth ack" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK 6 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Setting a large timeout to make sure we hit the voted_time limit. + R 0 config set cluster-node-timeout 150000 + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # Let replica drop FAILOVER_AUTH_ACK so that the election won't + # get the enough votes and the election will time out. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_ACK + + # The first manual failover will time out. + R 3 cluster failover + wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + + # Undo packet drop, so that replica can win the next election. + R 3 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Make sure the second manual failover will work. 
+ R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 2000}} { + test "Manual failover vote is not limited by two times the node timeout - mixed failover" { + # Make sure the failover is triggered by us. + R 1 config set cluster-replica-validity-factor 0 + R 3 config set cluster-replica-no-failover yes + R 3 config set cluster-replica-validity-factor 0 + + # Pause the primary. + pause_process [srv 0 pid] + wait_for_cluster_state fail + + # Setting a large timeout to make sure we hit the voted_time limit. + R 1 config set cluster-node-timeout 150000 + R 2 config set cluster-node-timeout 150000 + + # R 3 performs an automatic failover and it will work. + R 3 config set cluster-replica-no-failover no + wait_for_condition 1000 50 { + [s -3 role] eq {master} + } else { + fail "The first failover does not happen" + } + + # Resume the primary and wait for it to become a replica. + resume_process [srv 0 pid] + wait_for_condition 1000 50 { + [s 0 role] eq {slave} + } else { + fail "Old primary not converted into replica" + } + wait_for_cluster_propagation + + # The old primary doing a manual failover and wait for it. + R 0 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {master} && + [s -3 role] eq {slave} + } else { + fail "The second failover does not happen" + } + wait_for_cluster_propagation + + # R 3 performs a manual failover and it will work. 
+ R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "The third failover does not happen" + } + wait_for_cluster_propagation + } +} ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + test "Manual failover will reset the on-going election" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Let other primaries drop FAILOVER_AUTH_REQUEST so that the election won't + # get enough votes and the election will time out. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + + # Replica doing the manual failover. + R 3 cluster failover + + # Waiting for primary and replica to confirm manual failover timeout. + wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + set loglines1 [count_log_lines 0] + set loglines2 [count_log_lines -3] + + # Undo packet drop, so that replica can win the next election. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Replica doing the manual failover again. + R 3 cluster failover + + # Make sure the election is reset. + wait_for_log_messages -3 {"*Failover election in progress*Resetting the election*"} $loglines2 1000 50 + + # Wait for failover. + wait_for_condition 1000 50 { + [s -3 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure that the second manual failover does not time out.
+ verify_no_log_message 0 "*Manual failover timed out*" $loglines1 + verify_no_log_message -3 "*Manual failover timed out*" $loglines2 + } +} ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 1000}} { + test "Broadcast PONG to the cluster when the node role changes" { + # R0 is a primary and R3 is a replica, we will do multiple cluster failover + # and then check their role and flags. + set R0_nodeid [R 0 cluster myid] + set R3_nodeid [R 3 cluster myid] + + # Make sure we don't send PINGs for a short period of time. + for {set j 0} {$j < [llength $::servers]} {incr j} { + R $j debug disable-cluster-random-ping 0 + R $j config set cluster-ping-interval 300000 + } + + R 3 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && + [s -3 role] eq {master} + } else { + fail "Failover does not happened" + } + + # Get the node information of R0 and R3 in my view from CLUSTER NODES + # R0 should be a replica and R3 should be a primary in all views. + for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 1000 50 { + [check_cluster_node_mark slave $j $R0_nodeid] && + [check_cluster_node_mark master $j $R3_nodeid] + } else { + puts "R0_nodeid: $R0_nodeid" + puts "R3_nodeid: $R3_nodeid" + puts "R $j cluster nodes:" + puts [R $j cluster nodes] + fail "Node role does not changed in the first failover" + } + } + + R 0 cluster failover + wait_for_condition 1000 50 { + [s 0 role] eq {master} && + [s -3 role] eq {slave} + } else { + fail "The second failover does not happened" + } + + # Get the node information of R0 and R3 in my view from CLUSTER NODES + # R0 should be a primary and R3 should be a replica in all views. 
+ for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 1000 50 { + [check_cluster_node_mark master $j $R0_nodeid] && + [check_cluster_node_mark slave $j $R3_nodeid] + } else { + puts "R0_nodeid: $R0_nodeid" + puts "R3_nodeid: $R3_nodeid" + puts "R $j cluster nodes:" + puts [R $j cluster nodes] + fail "Node role does not changed in the second failover" + } + } + } +} ;# start_cluster diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index 05d6528684..591d732fce 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -90,6 +90,8 @@ proc test_migrated_replica {type} { # Wait for the cluster to be ok. wait_for_condition 1000 50 { + [R 3 cluster slots] eq [R 4 cluster slots] && + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 3 cluster_state] eq "ok" && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" @@ -187,6 +189,7 @@ proc test_nonempty_replica {type} { # Wait for the cluster to be ok. wait_for_condition 1000 50 { + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" } else { @@ -306,6 +309,8 @@ proc test_sub_replica {type} { # Wait for the cluster to be ok. 
wait_for_condition 1000 50 { + [R 3 cluster slots] eq [R 4 cluster slots] && + [R 4 cluster slots] eq [R 7 cluster slots] && [CI 3 cluster_state] eq "ok" && [CI 4 cluster_state] eq "ok" && [CI 7 cluster_state] eq "ok" @@ -400,3 +405,23 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { test_cluster_setslot "setslot" } my_slot_allocation cluster_allocate_replicas ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { + test "Empty primary will check and delete the dirty slots" { + R 2 config set cluster-allow-replica-migration no + + # Write a key to slot 0. + R 2 incr key_977613 + + # Move slot 0 from primary 2 to primary 0. + R 0 cluster bumpepoch + R 0 cluster setslot 0 node [R 0 cluster myid] + + # Wait for R 2 to report that it is an empty primary (cluster-allow-replica-migration no) + wait_for_log_messages -2 {"*I am now an empty primary*"} 0 1000 50 + + # Make sure primary 2 will delete the dirty slots. + verify_log_message -2 "*Deleting keys in dirty slot 0*" 0 + assert_equal [R 2 dbsize] 0 + } +} my_slot_allocation cluster_allocate_replicas ;# start_cluster diff --git a/tests/unit/cluster/slot-migration.tcl b/tests/unit/cluster/slot-migration.tcl index d798971968..289c20578d 100644 --- a/tests/unit/cluster/slot-migration.tcl +++ b/tests/unit/cluster/slot-migration.tcl @@ -14,17 +14,61 @@ proc get_cluster_role {srv_idx} { return $role } +proc get_myself_primary_flags {srv_idx} { + set flags [dict get [cluster_get_myself_primary $srv_idx] flags] + return $flags +} + +proc get_myself_primary_linkstate {srv_idx} { + set linkstate [dict get [cluster_get_myself_primary $srv_idx] linkstate] + return $linkstate +} + proc wait_for_role {srv_idx role} { + # Wait for the role, make sure the replication role matches.
wait_for_condition 100 100 { [lindex [split [R $srv_idx ROLE] " "] 0] eq $role } else { + puts "R $srv_idx ROLE: [R $srv_idx ROLE]" fail "R $srv_idx didn't assume the replication $role in time" } + + if {$role eq "slave"} { + # Wait for the replication link, make sure the replication link is normal. + wait_for_condition 100 100 { + [s -$srv_idx master_link_status] eq "up" + } else { + puts "R $srv_idx INFO REPLICATION: [R $srv_idx INFO REPLICATION]" + fail "R $srv_idx didn't assume the replication link in time" + } + } + + # Wait for the cluster role, make sure the cluster role matches. wait_for_condition 100 100 { [get_cluster_role $srv_idx] eq $role } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" fail "R $srv_idx didn't assume the cluster $role in time" } + + if {$role eq "slave"} { + # Wait for the flags, make sure the primary node is not failed. + wait_for_condition 100 100 { + [get_myself_primary_flags $srv_idx] eq "master" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the primary state in time" + } + + # Wait for the cluster link, make sure that the cluster connection is normal. + wait_for_condition 100 100 { + [get_myself_primary_linkstate $srv_idx] eq "connected" + } else { + puts "R $srv_idx CLUSTER NODES: [R $srv_idx CLUSTER NODES]" + fail "R $srv_idx didn't assume the cluster primary link in time" + } + } + wait_for_cluster_propagation } diff --git a/tests/unit/cluster/slot-ownership.tcl b/tests/unit/cluster/slot-ownership.tcl index 0f3e3cc4f7..0073c2904f 100644 --- a/tests/unit/cluster/slot-ownership.tcl +++ b/tests/unit/cluster/slot-ownership.tcl @@ -59,3 +59,88 @@ start_cluster 2 2 {tags {external:skip cluster}} { } } } + +start_cluster 3 1 {tags {external:skip cluster} overrides {shutdown-timeout 100}} { + test "Primary lost a slot during the shutdown waiting" { + R 0 set FOO 0 + + # Pause the replica. 
+ pause_process [srv -3 pid] + + # Incr the key and immediately shutdown the primary. + # The primary waits for the replica to replicate before exiting. + R 0 incr FOO + exec kill -SIGTERM [srv 0 pid] + wait_for_condition 50 100 { + [s 0 shutdown_in_milliseconds] > 0 + } else { + fail "Primary not indicating ongoing shutdown." + } + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Resume the replica and make sure primary exits normally instead of crashing. + resume_process [srv -3 pid] + wait_for_log_messages 0 {"*Valkey is now ready to exit, bye bye*"} 0 1000 10 + + # Make sure that the replica will become the new primary and does not own the key. + wait_for_condition 1000 50 { + [s -3 role] eq {master} + } else { + fail "The replica was not converted into primary" + } + assert_error {ERR no such key} {R 3 debug object foo} + } +} + +start_cluster 3 1 {tags {external:skip cluster}} { + test "Primary lost a slot during the manual failover pausing" { + R 0 set FOO 0 + + # Set primaries to drop the FAILOVER_AUTH_REQUEST packets, so that + # primary 0 will pause until the failover times out. + R 1 debug drop-cluster-packet-filter 5 + R 2 debug drop-cluster-packet-filter 5 + + # Replica doing the manual failover. + R 3 cluster failover + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Make sure primary doesn't crash when deleting the keys. 
+ R 0 ping + + R 1 debug drop-cluster-packet-filter -1 + R 2 debug drop-cluster-packet-filter -1 + } +} + +start_cluster 3 1 {tags {external:skip cluster}} { + test "Primary lost a slot during the client pause command" { + R 0 set FOO 0 + + R 0 client pause 1000000000 write + + # Move the slot to other primary + R 1 cluster bumpepoch + R 1 cluster setslot [R 1 cluster keyslot FOO] node [R 1 cluster myid] + + # Waiting for dirty slot update. + wait_for_log_messages 0 {"*Deleting keys in dirty slot*"} 0 1000 10 + + # Make sure primary doesn't crash when deleting the keys. + R 0 ping + + R 0 client unpause + } +} diff --git a/tests/unit/expire.tcl b/tests/unit/expire.tcl index d85ce7ee68..941acfad38 100644 --- a/tests/unit/expire.tcl +++ b/tests/unit/expire.tcl @@ -832,6 +832,85 @@ start_server {tags {"expire"}} { close_replication_stream $repl assert_equal [r debug set-active-expire 1] {OK} } {} {needs:debug} + + test {Import mode should forbid active expiration} { + r flushall + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 bar PX 1 + r set foo2 bar PX 1 + after 10 + + assert_equal [r dbsize] {2} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." 
+ } + } + + test {Import mode should forbid lazy expiration} { + r flushall + r debug set-active-expire 0 + + r config set import-mode yes + assert_equal [r client import-source on] {OK} + + r set foo1 1 PX 1 + after 10 + + r get foo1 + assert_equal [r dbsize] {1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + r get foo1 + + assert_equal [r dbsize] {0} + + assert_equal [r debug set-active-expire 1] {OK} + } {} {needs:debug} + + test {Client can visit expired key in import-source state} { + r flushall + + r config set import-mode yes + + r set foo1 1 PX 1 + after 10 + + # Normal clients cannot visit expired key. + assert_equal [r get foo1] {} + assert_equal [r ttl foo1] {-2} + assert_equal [r dbsize] 1 + + # Client can visit expired key when in import-source state. + assert_equal [r client import-source on] {OK} + assert_equal [r ttl foo1] {0} + assert_equal [r get foo1] {1} + assert_equal [r incr foo1] {2} + assert_equal [r randomkey] {foo1} + assert_equal [r scan 0 match * count 10000] {0 foo1} + assert_equal [r keys *] {foo1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + # Verify all keys have expired + wait_for_condition 40 100 { + [r dbsize] eq 0 + } else { + fail "Keys did not actively expire." 
+ } + } } start_cluster 1 0 {tags {"expire external:skip cluster"}} { diff --git a/tests/unit/hyperloglog.tcl b/tests/unit/hyperloglog.tcl index c1b3b3a79f..765d5e0bdd 100644 --- a/tests/unit/hyperloglog.tcl +++ b/tests/unit/hyperloglog.tcl @@ -222,6 +222,46 @@ start_server {tags {"hll"}} { assert_equal 3 [r pfcount destkey] } + test {PFMERGE results with simd} { + r del hllscalar{t} hllsimd{t} hll1{t} hll2{t} hll3{t} + for {set x 1} {$x < 2000} {incr x} { + r pfadd hll1{t} [expr rand()] + } + for {set x 1} {$x < 4000} {incr x} { + r pfadd hll2{t} [expr rand()] + } + for {set x 1} {$x < 8000} {incr x} { + r pfadd hll3{t} [expr rand()] + } + assert {[r pfcount hll1{t}] > 0} + assert {[r pfcount hll2{t}] > 0} + assert {[r pfcount hll3{t}] > 0} + + r pfdebug simd off + set scalar [r pfcount hll1{t} hll2{t} hll3{t}] + r pfdebug simd on + set simd [r pfcount hll1{t} hll2{t} hll3{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + r pfdebug simd off + r pfmerge hllscalar{t} hll1{t} hll2{t} hll3{t} + r pfdebug simd on + r pfmerge hllsimd{t} hll1{t} hll2{t} hll3{t} + + set scalar [r pfcount hllscalar{t}] + set simd [r pfcount hllsimd{t}] + assert {$scalar > 0} + assert {$simd > 0} + assert_equal $scalar $simd + + set scalar [r get hllscalar{t}] + set simd [r get hllsimd{t}] + assert_equal $scalar $simd + + } {} {needs:pfdebug} + test {PFCOUNT multiple-keys merge returns cardinality of union #1} { r del hll1{t} hll2{t} hll3{t} for {set x 1} {$x < 10000} {incr x} { diff --git a/tests/unit/info.tcl b/tests/unit/info.tcl index 61d1acd1f8..278a1d8e33 100644 --- a/tests/unit/info.tcl +++ b/tests/unit/info.tcl @@ -424,8 +424,7 @@ start_server {tags {"info" "external:skip"}} { set info [r info clients] assert_equal [getInfoProperty $info pubsub_clients] {1} # non-pubsub clients should not be involved - catch {unsubscribe $rd2 {non-exist-chan}} e - assert_match {*NOSUB*} $e + assert_equal {0} [unsubscribe $rd2 {non-exist-chan}] set info [r info clients] 
assert_equal [getInfoProperty $info pubsub_clients] {1} # close all clients diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 352f5f183e..bafc46d4b7 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -376,6 +376,32 @@ start_server {tags {"introspection"}} { $rd close } + # This test verifies that MONITOR correctly records overwritten commands + # when executed within a MULTI-EXEC block. Specifically, it checks that even if + # the original SET-EX command arguments are overwritten for replica propagation, the MONITOR output + # still shows the original command. + test {MONITOR correctly records SET EX in MULTI-EXEC} { + # Start monitoring client + set rd [valkey_deferring_client] + $rd monitor + $rd read ; # Discard the OK + + # Execute multi-exec block with SET EX commands + r multi + r set "{slot}key1" value1 ex 3600 + r set "{slot}key2" value2 ex 1800 + r exec + + # Verify monitor output shows the original commands: + assert_match {*"multi"*} [$rd read] + assert_match {*"set"*"{slot}key1"*"value1"*"ex"*"3600"*} [$rd read] + assert_match {*"set"*"{slot}key2"*"value2"*"ex"*"1800"*} [$rd read] + assert_match {*"exec"*} [$rd read] + + # Clean up monitoring client + $rd close + } + test {MONITOR log blocked command only once} { # need to reconnect in order to reset the clients state reconnect @@ -558,6 +584,10 @@ start_server {tags {"introspection"}} { req-res-logfile client-default-resp dual-channel-replication-enabled + rdma-completion-vector + rdma-rx-size + rdma-bind + rdma-port } if {!$::tls} { @@ -950,6 +980,13 @@ start_server {tags {"introspection"}} { } } {} {external:skip} + test {valkey-server command line arguments - dir multiple times} { + start_server {config "default.conf" args {--dir "./" --dir "./"}} { + r config get dir + assert_equal {PONG} [r ping] + } + } {} {external:skip} + # Config file at this point is at a weird state, and includes all # known keywords. 
Might be a good idea to avoid adding tests here. } @@ -1005,6 +1042,49 @@ test {config during loading} { } } {} {external:skip} +test {MEMORY commands during loading} { + start_server [list overrides [list key-load-delay 50 loading-process-events-interval-bytes 1024]] { + # Set up some initial data + r debug populate 100000 key 1000 + + # Save and restart + r save + restart_server 0 false false + + # At this point, keys are loaded one at time, busy looping 50usec + # between each. Further, other events are processed every 1024 bytes + # of RDB. We're sending all our commands deferred, so they have a + # chance to be processed all at once between loading two keys. + + set rd [valkey_deferring_client] + + # Allowed during loading + $rd memory help + $rd memory malloc-stats + $rd memory purge + + # Disallowed during loading (because directly dependent on the dataset) + $rd memory doctor + $rd memory stats + $rd memory usage key:1 + + # memory help + assert_match {{MEMORY *}} [$rd read] + # memory malloc-stats + assert_match {*alloc*} [$rd read] + # memory purge + assert_match OK [$rd read] + # memory doctor + assert_error {*LOADING*} {$rd read} + # memory stats + assert_error {*LOADING*} {$rd read} + # memory usage key:1 + assert_error {*LOADING*} {$rd read} + + $rd close + } +} {} {external:skip} + test {CONFIG REWRITE handles rename-command properly} { start_server {tags {"introspection"} overrides {rename-command {flushdb badger}}} { assert_error {ERR unknown command*} {r flushdb} diff --git a/tests/unit/keyspace.tcl b/tests/unit/keyspace.tcl index ba55c1b8ea..1936f5e217 100644 --- a/tests/unit/keyspace.tcl +++ b/tests/unit/keyspace.tcl @@ -47,6 +47,10 @@ start_server {tags {"keyspace"}} { r dbsize } {0} + test {KEYS with empty DB} { + assert_equal {} [r keys *] + } + test "DEL against expired key" { r debug set-active-expire 0 r setex keyExpire 1 valExpire @@ -554,3 +558,14 @@ foreach {type large} [array get largevalue] { r KEYS [string repeat "*?" 
50000] } {} } + +start_cluster 1 0 {tags {"keyspace external:skip cluster"}} { + test {KEYS with empty DB in cluster mode} { + assert_equal {} [r keys *] + assert_equal {} [r keys foo*] + } + + test {KEYS with empty slot in cluster mode} { + assert_equal {} [r keys foo] + } +} diff --git a/tests/unit/maxmemory.tcl b/tests/unit/maxmemory.tcl index d4e62246f1..89e9699a3e 100644 --- a/tests/unit/maxmemory.tcl +++ b/tests/unit/maxmemory.tcl @@ -611,3 +611,21 @@ start_server {tags {"maxmemory" "external:skip"}} { assert {[r object freq foo] == 5} } } + +start_server {tags {"maxmemory" "external:skip"}} { + test {Import mode should forbid eviction} { + r set key val + r config set import-mode yes + assert_equal [r client import-source on] {OK} + r config set maxmemory-policy allkeys-lru + r config set maxmemory 1 + + assert_equal [r dbsize] {1} + assert_error {OOM command not allowed*} {r set key1 val1} + + assert_equal [r client import-source off] {OK} + r config set import-mode no + + assert_equal [r dbsize] {0} + } +} \ No newline at end of file diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index d5a6a6efe2..abd23b1d83 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -40,7 +40,6 @@ run_solo {defrag} { proc test_active_defrag {type} { if {[string match {*jemalloc*} [s mem_allocator]] && [r debug mallctl arenas.page] <= 8192} { test "Active defrag main dictionary: $type" { - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -89,6 +88,8 @@ run_solo {defrag} { r config set active-defrag-cycle-min 65 r config set active-defrag-cycle-max 75 + after 1000 ;# Give defrag time to work (might be multiple cycles) + # Wait for the active defrag to stop working. 
wait_for_condition 2000 100 { [s active_defrag_running] eq 0 @@ -138,12 +139,13 @@ run_solo {defrag} { r config resetstat r config set key-load-delay -25 ;# sleep on average 1/25 usec r debug loadaof + after 1000 ;# give defrag a chance to work before turning it off r config set activedefrag no + # measure hits and misses right after aof loading set misses [s active_defrag_misses] set hits [s active_defrag_hits] - after 120 ;# serverCron only updates the info once in 100ms set frag [s allocator_frag_ratio] set max_latency 0 foreach event [r latency latest] { @@ -181,7 +183,6 @@ run_solo {defrag} { r flushdb sync r script flush sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -203,7 +204,7 @@ run_solo {defrag} { $rd read ; # Discard script load replies $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms + after 1000 ;# give defrag some time to work if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -239,6 +240,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag time to work (might be multiple cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -266,7 +269,6 @@ run_solo {defrag} { test "Active defrag big keys: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -361,6 +363,8 @@ run_solo {defrag} { fail "defrag not started." 
} + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -407,7 +411,6 @@ run_solo {defrag} { test "Active defrag pubsub: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-threshold-lower 5 r config set active-defrag-cycle-min 65 @@ -430,7 +433,6 @@ run_solo {defrag} { $rd read ; # Discard set replies } - after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { puts "used [s allocator_allocated]" puts "rss [s allocator_active]" @@ -466,6 +468,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -475,6 +479,7 @@ run_solo {defrag} { puts [r memory malloc-stats] fail "defrag didn't stop." } + r config set activedefrag no ;# disable before we accidentally create more frag # test the fragmentation is lower after 120 ;# serverCron only updates the info once in 100ms @@ -507,7 +512,6 @@ run_solo {defrag} { test "Active defrag big list: $type" { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -561,6 +565,8 @@ run_solo {defrag} { fail "defrag not started." 
} + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -619,7 +625,6 @@ run_solo {defrag} { start_server {tags {"defrag"} overrides {save ""}} { r flushdb sync r config resetstat - r config set hz 100 r config set activedefrag no r config set active-defrag-max-scan-fields 1000 r config set active-defrag-threshold-lower 5 @@ -685,6 +690,8 @@ run_solo {defrag} { fail "defrag not started." } + after 1000 ;# Give defrag some time to work (it may run several cycles) + # wait for the active defrag to stop working wait_for_condition 500 100 { [s active_defrag_running] eq 0 @@ -720,11 +727,11 @@ run_solo {defrag} { } } - start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "cluster" } - start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "standalone" } } ;# run_solo diff --git a/tests/unit/moduleapi/defrag.tcl b/tests/unit/moduleapi/defrag.tcl index e169f8de9b..6d8f55bd06 100644 --- a/tests/unit/moduleapi/defrag.tcl +++ b/tests/unit/moduleapi/defrag.tcl @@ -2,7 +2,6 @@ set testmodule [file normalize tests/modules/defragtest.so] start_server {tags {"modules"} overrides {{save ""}}} { r module load $testmodule 10000 - r config set hz 100 r config set active-defrag-ignore-bytes 1 r config set active-defrag-threshold-lower 0 r config set active-defrag-cycle-min 99 diff --git a/tests/unit/moduleapi/fork.tcl b/tests/unit/moduleapi/fork.tcl index 9d1f9c184c..bf53bd2db8 100644 
--- a/tests/unit/moduleapi/fork.tcl +++ b/tests/unit/moduleapi/fork.tcl @@ -26,7 +26,7 @@ start_server {tags {"modules"}} { # module fork twice assert_error {Fork failed} {r fork.create 0 1} - assert {[count_log_message 0 "Can't fork for module: File exists"] eq "1"} + assert {[count_log_message 0 "Can't fork for module: Operation already in progress"] eq "1"} r fork.kill diff --git a/tests/unit/moduleapi/moduleconfigs.tcl b/tests/unit/moduleapi/moduleconfigs.tcl index 44f994d2d0..2474ad3567 100644 --- a/tests/unit/moduleapi/moduleconfigs.tcl +++ b/tests/unit/moduleapi/moduleconfigs.tcl @@ -1,5 +1,15 @@ set testmodule [file normalize tests/modules/moduleconfigs.so] set testmoduletwo [file normalize tests/modules/moduleconfigstwo.so] +set testmoduleparameter [file normalize tests/modules/moduleparameter.so] + +proc module_get_args {mod} { + foreach line [r module list] { + if {[dict get $line name] eq $mod} { + return [dict get $line args] + } + } + throw error {module not found} +} start_server {tags {"modules"}} { r module load $testmodule @@ -243,5 +253,14 @@ start_server {tags {"modules"}} { assert_equal [r config get moduleconfigs.memory_numeric] "moduleconfigs.memory_numeric 1024" } } -} + test {Module Update Args} { + r module load $testmoduleparameter 10 20 30 + set t [r module list] + set modulename [lmap x [r module list] {dict get $x name}] + assert_not_equal [lsearch $modulename moduleparameter] -1 + assert_equal {10 20 30} [module_get_args moduleparameter] + assert_equal OK [r testmoduleparameter.update.parameter 40 50 60 70] + assert_equal {40 50 60 70} [module_get_args moduleparameter] + } +} diff --git a/tests/unit/pause.tcl b/tests/unit/pause.tcl index 38c13afc46..b18a32d48f 100644 --- a/tests/unit/pause.tcl +++ b/tests/unit/pause.tcl @@ -260,6 +260,33 @@ start_server {tags {"pause network"}} { r client unpause } + test "Test eviction is skipped during client pause" { + r flushall + set evicted_keys [s 0 evicted_keys] + + r multi + r set foo{t} bar 
+ r config set maxmemory-policy allkeys-random + r config set maxmemory 1 + r client PAUSE 50000 WRITE + r exec + + # No keys should actually have been evicted. + assert_match $evicted_keys [s 0 evicted_keys] + + # The previous config set triggers a time event, but due to the pause, + # no eviction has been made. After the unpause, an eviction will happen. + r client unpause + wait_for_condition 1000 10 { + [expr $evicted_keys + 1] eq [s 0 evicted_keys] + } else { + fail "Key is not evicted" + } + + r config set maxmemory 0 + r config set maxmemory-policy noeviction + } + test "Test both active and passive expires are skipped during client pause" { set expired_keys [s 0 expired_keys] r multi diff --git a/tests/unit/pubsub.tcl b/tests/unit/pubsub.tcl index 68dc79a4a4..24b78b6e5a 100644 --- a/tests/unit/pubsub.tcl +++ b/tests/unit/pubsub.tcl @@ -109,12 +109,9 @@ start_server {tags {"pubsub network"}} { $rd1 close } - test "UNSUBSCRIBE and PUNSUBSCRIBE from non-subscribed channels" { + test "UNSUBSCRIBE from non-subscribed channels" { set rd1 [valkey_deferring_client] - foreach command {unsubscribe punsubscribe} { - catch {$command $rd1 {foo bar quux}} e - assert_match {*NOSUB*} $e - } + assert_equal {0 0 0} [unsubscribe $rd1 {foo bar quux}] # clean up clients $rd1 close } @@ -204,6 +201,14 @@ start_server {tags {"pubsub network"}} { $rd close } {0} {resp3} + test "PUNSUBSCRIBE from non-subscribed channels" { + set rd1 [valkey_deferring_client] + assert_equal {0 0 0} [punsubscribe $rd1 {foo.* bar.* quux.*}] + + # clean up clients + $rd1 close + } + test "NUMSUB returns numbers, not strings (#1561)" { r pubsub numsub abc def } {abc 0 def 0} @@ -241,6 +246,16 @@ start_server {tags {"pubsub network"}} { $rd1 close } + test "PUNSUBSCRIBE and UNSUBSCRIBE should always reply" { + # Make sure we are not subscribed to any channel at all. + r punsubscribe + r unsubscribe + # Now check if the commands still reply correctly.
+ set reply1 [r punsubscribe] + set reply2 [r unsubscribe] + concat $reply1 $reply2 + } {punsubscribe {} 0 unsubscribe {} 0} + ### Keyspace events notification tests test "Keyspace notifications: we receive keyspace notifications" { diff --git a/tests/unit/pubsubshard.tcl b/tests/unit/pubsubshard.tcl index d62a415705..e0e1e2972b 100644 --- a/tests/unit/pubsubshard.tcl +++ b/tests/unit/pubsubshard.tcl @@ -74,8 +74,9 @@ start_server {tags {"pubsubshard external:skip"}} { test "SUNSUBSCRIBE from non-subscribed channels" { set rd1 [valkey_deferring_client] - catch {sunsubscribe $rd1 {foo}} e - assert_match {*NOSUB*} $e + assert_equal {0} [sunsubscribe $rd1 {foo}] + assert_equal {0} [sunsubscribe $rd1 {bar}] + assert_equal {0} [sunsubscribe $rd1 {quux}] # clean up clients $rd1 close diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index d934e48140..d736b9cdb7 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -944,7 +944,7 @@ start_server { # Simulate loading from RDB - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] set prev_seen [dict get $consumer seen-time] @@ -954,7 +954,7 @@ start_server { r DEL mystream r RESTORE mystream 0 $dump - set reply [r XINFO STREAM x FULL] + set reply [r XINFO STREAM mystream FULL] set group [lindex [dict get $reply groups] 0] set consumer [lindex [dict get $group consumers] 0] assert_equal $prev_seen [dict get $consumer seen-time] diff --git a/utils/releasetools/build-config.json b/utils/releasetools/build-config.json index 5ee9308b3b..5e39fae70f 100644 --- a/utils/releasetools/build-config.json +++ b/utils/releasetools/build-config.json @@ -12,6 +12,12 @@ "type": "deb", "platform": "focal" }, + { + "arch": "x86_64", + "target": "ubuntu24.04", + "type": "deb", + "platform": "noble" + }, { "arch": "arm64", "target": "ubuntu18.04", @@ 
-23,6 +29,12 @@ "target": "ubuntu20.04", "type": "deb", "platform": "focal" + }, + { + "arch": "arm64", + "target": "ubuntu24.04", + "type": "deb", + "platform": "noble" } ] } \ No newline at end of file diff --git a/valkey.conf b/valkey.conf index 7c7b9da43e..e23aea39de 100644 --- a/valkey.conf +++ b/valkey.conf @@ -300,6 +300,54 @@ tcp-keepalive 300 # # tls-session-cache-timeout 60 +################################### RDMA ###################################### + +# Valkey Over RDMA is experimental, it may be changed or be removed in any minor or major version. +# By default, RDMA is disabled. To enable it, the "rdma-port" configuration +# directive can be used to define RDMA-listening ports. +# +# rdma-port 6379 +# rdma-bind 192.168.1.100 + +# The RDMA receive transfer buffer is 1M by default. It can be set between 64K and 16M. +# Note that page size aligned size is preferred. +# +# rdma-rx-size 1048576 + +# The RDMA completion queue will use the completion vector to signal completion events +# via hardware interrupts. A large number of hardware interrupts can affect CPU performance. +# It is possible to tune the performance using rdma-completion-vector. +# +# Example 1. a) Pin hardware interrupt vectors [0, 3] to CPU [0, 3]. +# b) Set CPU affinity for valkey to CPU [4, X]. +# c) Any valkey server uses a random RDMA completion vector [-1]. +# All valkey servers will not affect each other and will be isolated from kernel interrupts. +# +# SYS SYS SYS SYS VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | +# INTR0 INTR1 INTR2 INTR3 +# +# Example 2. a) 1:1 pin hardware interrupt vectors [0, X] to CPU [0, X]. +# b) Set CPU affinity for valkey [M] to CPU [M]. +# c) Valkey server [M] uses RDMA completion vector [M]. +# A single CPU [M] handles hardware interrupts, the RDMA completion vector [M], +# and the valkey server [M] within its context only. 
+# This avoids overhead and function calls across multiple CPUs, fully isolating +# each valkey server from one another. +# +# VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY VALKEY +# | | | | | | | +# CPU0 CPU1 CPU2 CPU3 CPU4 CPU5 ... CPUX +# | | | | | | | +# INTR0 INTR1 INTR2 INTR3 INTR4 INTR5 INTRX +# +# Use 0 and positive numbers to specify the RDMA completion vector, or specify -1 to allow +# the server to use a random vector for a new connection. The default vector is -1. +# +# rdma-completion-vector 0 + ################################# GENERAL ##################################### # By default the server does not run as a daemon. Use 'yes' if you need it. @@ -534,6 +582,9 @@ rdb-del-sync-files no # The working directory. # +# The server log is written relative to this directory, if the 'logfile' +# configuration directive is a relative path. +# # The DB will be written inside this directory, with the filename specified # above using the 'dbfilename' configuration directive. # @@ -543,6 +594,9 @@ rdb-del-sync-files no # 'cluster-config-file' configuration directive is a relative path. # # Note that you must specify a directory here, not a file name. +# Note that modifying 'dir' during runtime may have unexpected behavior, +# for example when a child process is running, related file operations may +# have unexpected effects. dir ./ ################################# REPLICATION ################################# @@ -818,6 +872,13 @@ replica-priority 100 # # replica-ignore-disk-write-errors no +# Make the primary forbid expiration and eviction. +# This is useful for sync tools, because expiration and eviction may cause data corruption. +# Sync tools can mark their connections as importing source by CLIENT IMPORT-SOURCE. +# NOTICE: Clients should avoid writing the same key on the source server and the destination server.
+# +# import-mode no + # ----------------------------------------------------------------------------- # By default, Sentinel includes all replicas in its reports. A replica # can be excluded from Sentinel's announcements. An unannounced replica @@ -2326,9 +2387,8 @@ rdb-save-incremental-fsync yes # Fragmentation is a natural process that happens with every allocator (but # less so with Jemalloc, fortunately) and certain workloads. Normally a server # restart is needed in order to lower the fragmentation, or at least to flush -# away all the data and create it again. However thanks to this feature -# implemented by Oran Agra, this process can happen at runtime -# in a "hot" way, while the server is running. +# away all the data and create it again. However thanks to this feature, this +# process can happen at runtime in a "hot" way, while the server is running. # # Basically when the fragmentation is over a certain level (see the # configuration options below) the server will start to create new copies of the @@ -2366,18 +2426,23 @@ rdb-save-incremental-fsync yes # Maximum percentage of fragmentation at which we use maximum effort # active-defrag-threshold-upper 100 -# Minimal effort for defrag in CPU percentage, to be used when the lower -# threshold is reached +# Minimal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the lower threshold is reached. # active-defrag-cycle-min 1 -# Maximal effort for defrag in CPU percentage, to be used when the upper -# threshold is reached +# Maximal effort for defrag in CPU percentage, not cycle time as the name might +# suggest, to be used when the upper threshold is reached. # active-defrag-cycle-max 25 # Maximum number of set/hash/zset/list fields that will be processed from # the main dictionary scan # active-defrag-max-scan-fields 1000 +# The time spent (in microseconds) of the periodic active defrag process. 
This +# affects the latency impact of active defrag on client commands. Smaller numbers +# will result in less latency impact at the cost of increased defrag overhead. +# active-defrag-cycle-us 500 + # Jemalloc background thread for purging will be enabled by default jemalloc-bg-thread yes