diff --git a/.github/workflows/topling-jni.yml b/.github/workflows/topling-jni.yml
new file mode 100644
index 0000000000..3079515167
--- /dev/null
+++ b/.github/workflows/topling-jni.yml
@@ -0,0 +1,129 @@
+# TODO: how can we cache make artifacts / speed up the build process here?
+name: "build topling-jni"
+
+on:
+ workflow_dispatch:
+ inputs:
+ repository_url:
+ required: true
+ default: 'topling/toplingdb'
+ repository_branch:
+ required: false
+ default: 'sideplugin-7.10.0-2022-12-21-bec42648'
+ test:
+ required: false
+ type: boolean
+ description: test SideGetBenchmarks
+ default: false
+ deploy_maven:
+ required: false
+ type: boolean
+ description: publish to maven repo
+ default: true
+
+jobs:
+ build:
+ # refer to https://github.com/actions/runner-images for details
+ runs-on: ubuntu-latest
+ env:
+ GCC_VER: "11.3" # TODO: better to get this from 'gcc --version'
+ GITHUB_TOKEN: ${{ github.token }}
+ REP_URL: ${{ inputs.repository_url }}
+ permissions:
+ contents: read
+ packages: write
+
+ steps:
+ - name: Checkout
+ uses: actions/checkout@v3
+ with:
+ repository: ${{ inputs.repository_url }}
+ ref: ${{ inputs.repository_branch }}
+ fetch-depth: 1
+
+ - name: Set up JDK 11
+ uses: actions/setup-java@v3
+ with:
+ java-version: '11'
+ distribution: 'temurin'
+ cache: maven
+ server-id: github # Value of the distributionManagement/repository/id field of the pom.xml
+ settings-path: ${{ github.workspace }} # location for the settings.xml file
+ #- name: Cache Maven # Replace by setup-java now
+ # uses: actions/cache@v3
+ # with:
+ # path: ~/.m2/repository
+ # key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+ # restore-keys: ${{ runner.os }}-m2
+
+ - name: Init Env & Compile RocksDB
+ run: |
+ cat $GITHUB_WORKSPACE/settings.xml
+ sudo apt-get update -y && sudo apt-get install -y \
+ libjemalloc-dev libaio-dev libgflags-dev zlib1g-dev \
+ libbz2-dev libcurl4-gnutls-dev liburing-dev \
+ libsnappy-dev libbz2-dev liblz4-dev libzstd-dev
+
+ gcc --version
+ git submodule update --init --recursive
+ mkdir -p ~/.ssh && mkdir -p /opt/lib
+ ssh-keyscan -t rsa github.com >> ~/.ssh/known_hosts
+ # this step can take a long time
+ make -j`nproc` DEBUG_LEVEL=0 shared_lib
+ sudo make install-shared PREFIX=/opt
+ ls -l /opt/lib
+
+ - name: Compile RocksDBJava
+ run: |
+ echo $JAVA_HOME
+ make rocksdbjava -j`nproc` DEBUG_LEVEL=0
+
+ - name: Move to Local Maven Repo
+ run: |
+ cd java/target || exit
+ cp -v rocksdbjni-7.10.0-linux64.jar rocksdbjni-7.10.0-SNAPSHOT-linux64.jar
+ mvn install:install-file -ntp -Dfile=rocksdbjni-7.10.0-SNAPSHOT-linux64.jar \
+ -DgroupId=org.rocksdb -DartifactId=rocksdbjni -Dversion=7.10.0-SNAPSHOT -Dpackaging=jar
+ # TODO: why doesn't 'deploy' include the install step here? using only deploy leaves no jar in the local repo
+ if ${{ inputs.deploy_maven }}; then
+ # TODO: which pom file should be used here? add it with '-DpomFile=/xx/pom.xml'
+ mvn deploy:deploy-file -e -s $GITHUB_WORKSPACE/settings.xml \
+ -DpomFile=$GITHUB_WORKSPACE/java/pom.xml.template \
+ -Durl=https://maven.pkg.github.com/$REP_URL -DrepositoryId=github \
+ -Dfile=rocksdbjni-7.10.0-SNAPSHOT-linux64.jar -DgroupId=org.rocksdb \
+ -DartifactId=rocksdbjni -Dversion=7.10.0-SNAPSHOT -Dpackaging=jar
+ fi
+
+ # compile jmh.jar to test performance
+ - name: Build SideGetBenchmarks with Maven
+ run: |
+ echo ${{ github.workspace }} && echo $GITHUB_WORKSPACE
+ pwd && ls -l
+ (cd java/jmh && ls -l && pwd) || exit
+ mvn clean package -e -ntp -f $GITHUB_WORKSPACE/java/jmh/pom.xml # -B runs in non-interactive (batch) mode
+
+ - name: Run SideGetBenchmarks & Check it
+ if: ${{ inputs.test }}
+ run: |
+ mkdir -p /dev/shm/db_bench_community
+ cd $GITHUB_WORKSPACE/java/jmh || exit
+ ls ../../sideplugin/rockside/src/topling/web
+ cp -v $GITHUB_WORKSPACE/sideplugin/rockside/src/topling/web/{style.css,index.html} /dev/shm/db_bench_community
+ echo $LD_LIBRARY_PATH
+ export LD_LIBRARY_PATH=/opt/lib:$LD_LIBRARY_PATH # for libterark-*
+ echo $LD_LIBRARY_PATH && ls -l /opt/lib
+ # Note: the embedded web server can only be visited while the benchmark is running
+ export LD_PRELOAD=libterark-zbs-g++-11.3-r.so:libterark-fsa-g++-11.3-r.so:libjemalloc.so
+ java -jar target/rocksdbjni-jmh-1.0-SNAPSHOT-benchmarks.jar \
+ -p keyCount=1000 -p keySize=128 -p valueSize=32768 \
+ -p sideConf=$GITHUB_WORKSPACE/sideplugin/rockside/sample-conf/db_bench_community.yaml SideGetBenchmarks
+
+ - name: Publish JAR to GitHub Packages
+ if: ${{ inputs.deploy_maven }}
+ run: |
+ cd $GITHUB_WORKSPACE/java/jmh || exit
+ ls -l $GITHUB_WORKSPACE && tail -15 pom.xml
+ mvn deploy -e -f $GITHUB_WORKSPACE/java/jmh/pom.xml -s $GITHUB_WORKSPACE/settings.xml \
+ -DaltDeploymentRepository=github::default::https://maven.pkg.github.com/$REP_URL
+ #env:
+ # GITHUB_TOKEN: ${{ github.token }}
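For reference, the `workflow_dispatch` inputs defined above can be supplied from the GitHub CLI; a minimal sketch, assuming `gh` is authenticated and the workflow file already exists on the default branch (the input values below are the defaults declared above):

```bash
# Manually trigger the "build topling-jni" workflow with explicit inputs.
# -f passes workflow_dispatch inputs; boolean inputs are given as "true"/"false".
gh workflow run topling-jni.yml \
  -f repository_url=topling/toplingdb \
  -f repository_branch=sideplugin-7.10.0-2022-12-21-bec42648 \
  -f test=true \
  -f deploy_maven=false

# Check that the run started.
gh run list --workflow=topling-jni.yml --limit 1
```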
diff --git a/.gitignore b/.gitignore
index 8bd9fea598..09da7844d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ rocksdb.pc
*.dylib*
*.gcda
*.gcno
+*.log
*.o
*.o.tmp
*.so
@@ -25,11 +26,13 @@ rocksdb.pc
*.vcxproj
*.vcxproj.filters
*.sln
+*.sst
*.cmake
.watchmanconfig
CMakeCache.txt
CMakeFiles/
build/
+build-ut/
ldb
manifest_dump
@@ -98,3 +101,8 @@ cmake-build-*
third-party/folly/
.cache
*.sublime-*
+*_dbg
+*_test
+
+generated-sources
+target
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000000..791e51fd91
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "sideplugin/rockside"]
+ path = sideplugin/rockside
+ url = https://github.com/topling/rockside.git
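Since `sideplugin/rockside` is registered as a submodule here, a fresh checkout has to initialize it before building; a small sketch of the two usual ways (plain git commands, nothing project-specific assumed):

```bash
# Option 1: clone with submodules in one step
git clone --recurse-submodules https://github.com/topling/toplingdb.git

# Option 2: initialize after a plain clone
# (the Makefile below also does this automatically when rockside is missing)
cd toplingdb
git submodule update --init --recursive
```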
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49868be894..ff6f5007a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -171,6 +171,8 @@ else()
endif()
endif()
+include_directories(sideplugin/rockside/src)
+
option(WITH_MD_LIBRARY "build with MD" ON)
if(WIN32 AND MSVC)
if(WITH_MD_LIBRARY)
@@ -181,6 +183,11 @@ if(WIN32 AND MSVC)
endif()
if(MSVC)
+ if(MSVC_VERSION LESS 1926)
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /experimental:preprocessor")
+ else()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:preprocessor")
+ endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi /nologo /EHsc /GS /Gd /GR /GF /fp:precise /Zc:wchar_t /Zc:forScope /errorReport:queue")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /FC /d2Zi+ /W4 /wd4127 /wd4800 /wd4996 /wd4351 /wd4100 /wd4204 /wd4324")
else()
@@ -613,8 +620,68 @@ endif()
find_package(Threads REQUIRED)
# Main library source code
+if (EXISTS ${PROJECT_SOURCE_DIR}/sideplugin/topling-rocks/CMakeFileList.txt)
+ message(STATUS "found ${PROJECT_SOURCE_DIR}/sideplugin/topling-rocks/CMakeFileList.txt")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+ include(${PROJECT_SOURCE_DIR}/sideplugin/topling-rocks/CMakeFileList.txt)
+else()
+ message(STATUS "not found ${PROJECT_SOURCE_DIR}/sideplugin/topling-rocks/CMakeFileList.txt")
+endif()
+
+if (EXISTS ${PROJECT_SOURCE_DIR}/sideplugin/rockside/CMakeFileList.txt)
+ message(STATUS "found ${PROJECT_SOURCE_DIR}/sideplugin/rockside/CMakeFileList.txt")
+ include(${PROJECT_SOURCE_DIR}/sideplugin/rockside/CMakeFileList.txt)
+else()
+ message(STATUS "not found ${PROJECT_SOURCE_DIR}/sideplugin/rockside/CMakeFileList.txt")
+endif()
+
+set (cspp_memtab ${PROJECT_SOURCE_DIR}/sideplugin/cspp-memtable/cspp_memtable.cc)
+if (EXISTS ${cspp_memtab})
+ message(STATUS "found ${cspp_memtab}")
+ set (topling_rocks_src ${topling_rocks_src} ${cspp_memtab})
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAS_TOPLING_CSPP_MEMTABLE")
+else()
+ message(STATUS "not found ${cspp_memtab}")
+endif()
+
+set (cspp_wbwi ${PROJECT_SOURCE_DIR}/sideplugin/cspp-wbwi/cspp_wbwi.cc)
+if (EXISTS ${cspp_wbwi})
+ message(STATUS "found ${cspp_wbwi}")
+ set (topling_rocks_src ${topling_rocks_src} ${cspp_wbwi})
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAS_TOPLING_CSPP_WBWI")
+else()
+ message(STATUS "not found ${cspp_wbwi}")
+endif()
+
+FILE(GLOB topling_sst ${PROJECT_SOURCE_DIR}/sideplugin/topling-sst/src/table/*.cc)
+if (EXISTS ${PROJECT_SOURCE_DIR}/sideplugin/topling-sst/src/table)
+ message(STATUS "found ${topling_sst}")
+ set (topling_rocks_src ${topling_rocks_src} ${topling_sst})
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DHAS_TOPLING_SST -Isideplugin/topling-sst/src")
+else()
+ message(STATUS "not found ${topling_sst}")
+endif()
+
+FILE(GLOB topling_zip_table_reader ${PROJECT_SOURCE_DIR}/sideplugin/topling-zip_table_reader/src/table/*.cc)
+if (EXISTS ${PROJECT_SOURCE_DIR}/sideplugin/topling-zip_table_reader/src/table)
+ message(STATUS "found ${topling_zip_table_reader}")
+ set (topling_rocks_src ${topling_rocks_src} ${topling_zip_table_reader})
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Isideplugin/topling-zip_table_reader/src")
+else()
+ message(STATUS "not found ${topling_zip_table_reader}")
+endif()
+
+FILE(GLOB topling_dcompact ${PROJECT_SOURCE_DIR}/sideplugin/topling-dcompact/src/dcompact/*.cc)
+if (EXISTS ${PROJECT_SOURCE_DIR}/sideplugin/topling-dcompact/src/dcompact)
+ message(STATUS "found ${topling_dcompact}")
+ set (topling_rocks_src ${topling_rocks_src} ${topling_dcompact})
+else()
+ message(STATUS "not found ${topling_dcompact}")
+endif()
set(SOURCES
+ ${rockside_src}
+ ${topling_rocks_src}
cache/cache.cc
cache/cache_entry_roles.cc
cache/cache_key.cc
@@ -646,6 +713,7 @@ set(SOURCES
db/builder.cc
db/c.cc
db/column_family.cc
+ db/compaction/compaction_executor.cc
db/compaction/compaction.cc
db/compaction/compaction_iterator.cc
db/compaction/compaction_picker.cc
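The CMake changes above detect whichever `sideplugin/*` directories exist at configure time and report each with a `found` / `not found` status message; a minimal configure sketch (the build directory name and generator are assumptions, not requirements):

```bash
# Configure from the repository root; optional sideplugin sources are picked up
# by the EXISTS checks and appended to rockside_src / topling_rocks_src.
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
# The configure log is then expected to contain lines such as:
#   -- found <src>/sideplugin/rockside/CMakeFileList.txt
#   -- not found <src>/sideplugin/topling-rocks/CMakeFileList.txt
cmake --build build -j"$(nproc)"
```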
diff --git a/INSTALL.md b/INSTALL.md
index fb4651e4b8..716a53fb4b 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -32,6 +32,15 @@ most processors made since roughly 2013.
## Dependencies
+* ToplingDB dependencies
+ - [libcurl](https://curl.se/libcurl/) - libcurl is a free and easy-to-use client-side URL transfer library
+ * ToplingDB [dcompact](https://github.com/topling/topling-dcompact) uses libcurl to submit compaction jobs to the compaction service (dcompact_worker)
+ - [liburing](https://github.com/axboe/liburing) - the io_uring library; ToplingDB uses it to optimize MultiGet
+ * ToplingDB adds `ReadOptions::async_queue_depth` to control the io_uring queue depth
+ * When ToplingDB is compiled as a shared library, this is not needed directly - it is used by [topling-zip](https://github.com/topling/topling-zip)
+ - [libaio](https://pagure.io/libaio) - The Linux-native asynchronous I/O facility
+ * libaio is the older Linux async I/O interface; io_uring should be preferred over libaio
+
* You can link RocksDB with following compression libraries:
- [zlib](http://www.zlib.net/) - a library for data compression.
- [bzip2](http://www.bzip.org/) - a library for data compression.
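On Debian/Ubuntu the three ToplingDB-specific dependencies above map to the packages already installed by the CI workflow; a sketch (package names on other distributions will differ):

```bash
# Development packages for libcurl, liburing and libaio on Debian/Ubuntu
sudo apt-get update -y
sudo apt-get install -y libcurl4-gnutls-dev liburing-dev libaio-dev
```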
diff --git a/LICENSE.Apache b/LICENSE.Apache
index d645695673..261eeb9e9f 100644
--- a/LICENSE.Apache
+++ b/LICENSE.Apache
@@ -1,4 +1,3 @@
-
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
diff --git a/Makefile b/Makefile
index 8829be9d85..96d53fb507 100644
--- a/Makefile
+++ b/Makefile
@@ -18,6 +18,15 @@ MACHINE ?= $(shell uname -m)
ARFLAGS = ${EXTRA_ARFLAGS} rs
STRIPFLAGS = -S -x
+# beg topling specific
+DISABLE_WARNING_AS_ERROR=1
+LIB_MODE=shared
+USE_RTTI=1
+ROCKSDB_USE_IO_URING=0
+ROCKSDB_DISABLE_TCMALLOC=1
+SKIP_FORMAT_BUCK_CHECKS=1
+# end topling specific
+
# Transform parallel LOG output into something more readable.
perl_command = perl -n \
-e '@a=split("\t",$$_,-1); $$t=$$a[8];' \
@@ -74,6 +83,8 @@ else ifneq ($(filter jtest rocksdbjava%, $(MAKECMDGOALS)),)
endif
endif
+$(info $$DEBUG_LEVEL is ${DEBUG_LEVEL}, MAKE_RESTARTS is [${MAKE_RESTARTS}])
+
# LIB_MODE says whether or not to use/build "shared" or "static" libraries.
# Mode "static" means to link against static libraries (.a)
# Mode "shared" means to link against shared libraries (.so, .sl, .dylib, etc)
@@ -99,11 +110,18 @@ dummy := $(shell (export ROCKSDB_ROOT="$(CURDIR)"; \
export COMPILE_WITH_UBSAN="$(COMPILE_WITH_UBSAN)"; \
export PORTABLE="$(PORTABLE)"; \
export ROCKSDB_NO_FBCODE="$(ROCKSDB_NO_FBCODE)"; \
+ export ROCKSDB_USE_IO_URING="$(ROCKSDB_USE_IO_URING)"; \
+ export ROCKSDB_DISABLE_TCMALLOC="$(ROCKSDB_DISABLE_TCMALLOC)"; \
+ export ROCKSDB_DISABLE_ZSTD=1; \
export USE_CLANG="$(USE_CLANG)"; \
export LIB_MODE="$(LIB_MODE)"; \
- export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
- export USE_FOLLY="$(USE_FOLLY)"; \
+ export ROCKSDB_CXX_STANDARD="$(ROCKSDB_CXX_STANDARD)"; \
+ export USE_FOLLY="$(USE_FOLLY)"; \
"$(CURDIR)/build_tools/build_detect_platform" "$(CURDIR)/make_config.mk"))
+ifneq (${.SHELLSTATUS},0)
+ $(error $(CURDIR)/build_tools/build_detect_platform failed with exit code ${.SHELLSTATUS})
+endif
+
# this file is generated by the previous line to set build flags and sources
include make_config.mk
@@ -117,12 +135,14 @@ OPT += $(OPTIMIZE_LEVEL)
# compile with -O2 if debug level is not 2
ifneq ($(DEBUG_LEVEL), 2)
+ifeq ($(WITH_FRAME_POINTER),1)
OPT += -fno-omit-frame-pointer
# Skip for archs that don't support -momit-leaf-frame-pointer
ifeq (,$(shell $(CXX) -fsyntax-only -momit-leaf-frame-pointer -xc /dev/null 2>&1))
OPT += -momit-leaf-frame-pointer
endif
endif
+endif
ifeq (,$(shell $(CXX) -fsyntax-only -maltivec -xc /dev/null 2>&1))
CXXFLAGS += -DHAS_ALTIVEC
@@ -209,6 +229,330 @@ endif
#-----------------------------------------------
include src.mk
+# dynamic extension makes the DLL load twice, so disable it via ROCKSDB_NO_DYNAMIC_EXTENSION
+CXXFLAGS += -DROCKSDB_NO_DYNAMIC_EXTENSION
+
+# civetweb show server stats
+CXXFLAGS += -DUSE_SERVER_STATS=1
+CFLAGS += -DUSE_SERVER_STATS=1
+
+# civetweb-v1.15 requires OPENSSL_API_1_1 or OPENSSL_API_1_0
+CXXFLAGS += -DOPENSSL_API_1_1=1
+CFLAGS += -DOPENSSL_API_1_1=1
+
+ifneq ($(filter check_% check-% %_tests %_test %_test2 \
+ watch-log format clean% tags% \
+ package% install install-%, \
+ $(MAKECMDGOALS)),)
+ UPDATE_REPO ?= 0
+endif
+
+ifeq (,$(wildcard sideplugin/rockside/3rdparty/rapidyaml))
+ $(warning NotFound sideplugin/rockside/3rdparty/rapidyaml)
+ $(warning sideplugin/rockside is a submodule, auto init...)
+ IsCloneOK := $(shell \
+ set -x -e; \
+ git submodule update --init --recursive >&2; \
+ echo $$?\
+ )
+ ifneq ("${IsCloneOK}","0")
+ $(error "IsCloneOK=${IsCloneOK} Error cloning rockside, stop!")
+ endif
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell set -ex; git pull && git submodule update --init --recursive)
+ endif
+ endif
+endif
+EXTRA_LIB_SOURCES += sideplugin/rockside/src/topling/rapidyaml_all.cc
+CXXFLAGS += -Isideplugin/rockside/3rdparty/rapidyaml \
+ -Isideplugin/rockside/3rdparty/rapidyaml/src \
+ -Isideplugin/rockside/3rdparty/rapidyaml/ext/c4core/src \
+ -DSIDE_PLUGIN_WITH_YAML=1
+
+# topling-core is Topling's private repo
+ifneq (,$(wildcard sideplugin/topling-core))
+ TOPLING_CORE_DIR := sideplugin/topling-core
+ CXXFLAGS += -DGITHUB_TOPLING_ZIP='"https://github.com/rockeet/topling-core"'
+else
+ CXXFLAGS += -DGITHUB_TOPLING_ZIP='"https://github.com/topling/topling-zip"'
+ # topling-zip is Topling's public repo
+ ifeq (,$(wildcard sideplugin/topling-zip))
+ $(warning sideplugin/topling-zip is not present, clone it from github...)
+ IsCloneOK := $(shell \
+ set -x -e; \
+ cd sideplugin; \
+ git clone https://github.com/topling/topling-zip.git >&2; \
+ cd topling-zip; \
+ git submodule update --init --recursive >&2; \
+ echo $$?\
+ )
+ ifneq ("${IsCloneOK}","0")
+ $(error "IsCloneOK=${IsCloneOK} Error cloning topling-zip, stop!")
+ endif
+ else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell set -ex; cd sideplugin/topling-zip && \
+ git pull && git submodule update --init --recursive)
+ endif
+ endif
+ endif
+ TOPLING_CORE_DIR := sideplugin/topling-zip
+endif
+
+COMPILER := $(shell set -e; tmpfile=`mktemp -u compiler-XXXXXX`; \
+ ${CXX} ${TOPLING_CORE_DIR}/tools/configure/compiler.cpp -o $${tmpfile}.exe; \
+ ./$${tmpfile}.exe && rm -f $${tmpfile}*)
+UNAME_MachineSystem := $(shell uname -m -s | sed 's:[ /]:-:g')
+WITH_BMI2 ?= $(shell bash ${TOPLING_CORE_DIR}/cpu_has_bmi2.sh)
+BUILD_NAME := ${UNAME_MachineSystem}-${COMPILER}-bmi2-${WITH_BMI2}
+BUILD_ROOT := build/${BUILD_NAME}
+ifeq (${DEBUG_LEVEL}, 0)
+ BUILD_TYPE_SIG := r
+ OBJ_DIR := ${BUILD_ROOT}/rls
+endif
+ifeq (${DEBUG_LEVEL}, 1)
+ BUILD_TYPE_SIG := a
+ OBJ_DIR := ${BUILD_ROOT}/afr
+endif
+ifeq (${DEBUG_LEVEL}, 2)
+ BUILD_TYPE_SIG := d
+ OBJ_DIR := ${BUILD_ROOT}/dbg
+endif
+ifneq ($(filter auto_all_tests check check_0 watch-log gen_parallel_tests %_test %_test2, $(MAKECMDGOALS)),)
+ CXXFLAGS += -DROCKSDB_UNIT_TEST
+ CXXFLAGS += -DROCKSDB_DYNAMIC_CREATE_CF
+ CXXFLAGS += -DTOPLINGDB_WITH_TIMESTAMP
+ CXXFLAGS += -DTOPLINGDB_WITH_WIDE_COLUMNS
+ MAKE_UNIT_TEST := 1
+ OBJ_DIR := $(subst build/,build-ut/,${OBJ_DIR})
+endif
+
+# 1. we define ROCKSDB_DISABLE_ZSTD=1 for build_detect_platform.
+# 2. the zstd lib is included in libterark-zbs
+# 3. we always use ZSTD
+CXXFLAGS += -DZSTD \
+ -I${TOPLING_CORE_DIR}/3rdparty/zstd/zstd \
+ -I${TOPLING_CORE_DIR}/3rdparty/zstd/zstd/dictBuilder
+
+CXXFLAGS += \
+ -I${TOPLING_CORE_DIR}/src \
+ -I${TOPLING_CORE_DIR}/boost-include \
+ -I${TOPLING_CORE_DIR}/3rdparty/zstd
+
+LDFLAGS += -L${TOPLING_CORE_DIR}/${BUILD_ROOT}/lib_shared \
+ -lterark-{zbs,fsa,core}-${COMPILER}-${BUILD_TYPE_SIG}
+
+ifndef WITH_TOPLING_ROCKS
+ # auto check
+ ifeq (,$(wildcard sideplugin/topling-rocks))
+ # topling specific: just for people who have permission to topling-rocks
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone git@github.com:rockeet/topling-rocks; \
+ cd topling-rocks; \
+ git submodule update --init --recursive \
+ )
+ endif
+ ifeq (,$(wildcard sideplugin/topling-rocks))
+ WITH_TOPLING_ROCKS := 0
+ else
+ WITH_TOPLING_ROCKS := 1
+ endif
+endif
+
+ifeq (${WITH_TOPLING_ROCKS},1)
+ifeq (,$(wildcard sideplugin/topling-rocks))
+ # topling specific: just for people who have permission to topling-rocks
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone git@github.com:rockeet/topling-rocks; \
+ cd topling-rocks; \
+ git submodule update --init --recursive \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell set -ex; cd sideplugin/topling-rocks && git pull)
+ endif
+ endif
+endif
+ifeq (,$(wildcard sideplugin/topling-rocks/src/table/top_zip_table_builder.cc))
+ $(error WITH_TOPLING_ROCKS=1 but repo sideplugin/topling-rocks is broken)
+endif
+endif
+
+ifeq (,$(wildcard sideplugin/cspp-memtable))
+ # topling specific: just for people who have permission to cspp-memtable
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone https://github.com/topling/cspp-memtable; \
+ cd cspp-memtable; \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell set -ex; cd sideplugin/cspp-memtable && git pull)
+ endif
+ endif
+endif
+ifeq (,$(wildcard sideplugin/cspp-wbwi))
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone https://github.com/topling/cspp-wbwi; \
+ cd cspp-wbwi; \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell set -ex; cd sideplugin/cspp-wbwi && git pull)
+ endif
+ endif
+endif
+
+ifneq (,$(wildcard sideplugin/cspp-memtable))
+ # now we have cspp-memtable
+ CXXFLAGS += -DHAS_TOPLING_CSPP_MEMTABLE
+ CSPP_MEMTABLE_GIT_VER_SRC = ${BUILD_ROOT}/git-version-cspp_memtable.cc
+ EXTRA_LIB_SOURCES += sideplugin/cspp-memtable/cspp_memtable.cc \
+ sideplugin/cspp-memtable/${CSPP_MEMTABLE_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/cspp-memtable, this is ok, only Topling CSPP MemTab is disabled)
+endif
+
+ifneq (,$(wildcard sideplugin/cspp-wbwi))
+ # now we have cspp-wbwi
+ CXXFLAGS += -DHAS_TOPLING_CSPP_WBWI
+ CSPP_WBWI_GIT_VER_SRC = ${BUILD_ROOT}/git-version-cspp_wbwi.cc
+ EXTRA_LIB_SOURCES += sideplugin/cspp-wbwi/cspp_wbwi.cc \
+ sideplugin/cspp-wbwi/${CSPP_WBWI_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/cspp-wbwi, this is ok, only Topling CSPP WBWI(WriteBatchWithIndex) is disabled)
+endif
+
+ifeq (,$(wildcard sideplugin/topling-sst/src/table))
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone https://github.com/topling/topling-sst; \
+ cd topling-sst; \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell cd sideplugin/topling-sst && git pull)
+ endif
+ endif
+endif
+ifneq (,$(wildcard sideplugin/topling-sst/src/table))
+ # now we have topling-sst
+ CXXFLAGS += -DHAS_TOPLING_SST -Isideplugin/topling-sst/src
+ TOPLING_SST_GIT_VER_SRC = ${BUILD_ROOT}/git-version-topling_sst.cc
+ EXTRA_LIB_SOURCES += $(wildcard sideplugin/topling-sst/src/table/*.cc) \
+ sideplugin/topling-sst/${TOPLING_SST_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/topling-sst, this is ok, only Topling Open SST(s) are disabled)
+endif
+
+ifeq (,$(wildcard sideplugin/topling-zip_table_reader/src/table))
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone https://github.com/topling/topling-zip_table_reader; \
+ cd topling-zip_table_reader; \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell cd sideplugin/topling-zip_table_reader && git pull)
+ endif
+ endif
+endif
+ifneq (,$(wildcard sideplugin/topling-zip_table_reader/src/table))
+ # now we have topling-zip_table_reader
+ CXXFLAGS += -DHAS_TOPLING_SST -Isideplugin/topling-zip_table_reader/src
+ TOPLING_ZIP_TABLE_READER_GIT_VER_SRC = ${BUILD_ROOT}/git-version-topling_zip_table_reader.cc
+ EXTRA_LIB_SOURCES += $(wildcard sideplugin/topling-zip_table_reader/src/table/*.cc) \
+ sideplugin/topling-zip_table_reader/${TOPLING_ZIP_TABLE_READER_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/topling-zip_table_reader, this is ok, only Topling Open SST(s) are disabled)
+endif
+
+
+ifeq (,$(wildcard sideplugin/topling-dcompact/src/dcompact))
+ dummy := $(shell set -e -x; \
+ cd sideplugin; \
+ git clone https://github.com/topling/topling-dcompact; \
+ cd topling-dcompact; \
+ )
+else
+ ifneq (${UPDATE_REPO},0)
+ ifeq (${MAKE_RESTARTS},)
+ dummy := $(shell cd sideplugin/topling-dcompact && git pull)
+ endif
+ endif
+endif
+ifneq (,$(wildcard sideplugin/topling-dcompact/src/dcompact))
+ # now we have topling-dcompact
+ #CXXFLAGS += -Isideplugin/topling-dcompact/src
+ LDFLAGS += -lstdc++fs -lcurl
+ TOPLING_DCOMPACT_GIT_VER_SRC := ${BUILD_ROOT}/git-version-topling_dcompact.cc
+ EXTRA_LIB_SOURCES += $(wildcard sideplugin/topling-dcompact/src/dcompact/*.cc) \
+ sideplugin/topling-dcompact/${TOPLING_DCOMPACT_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/topling-dcompact, this is ok, only topling-dcompact is disabled)
+endif
+
+export LD_LIBRARY_PATH:=${TOPLING_CORE_DIR}/${BUILD_ROOT}/lib_shared:${LD_LIBRARY_PATH}
+ifeq (${WITH_TOPLING_ROCKS},1)
+ifneq (,$(wildcard sideplugin/topling-rocks))
+ CXXFLAGS += -I sideplugin/topling-rocks/src
+ TOPLING_ROCKS_GIT_VER_SRC = ${BUILD_ROOT}/git-version-topling_rocks.cc
+ EXTRA_LIB_SOURCES += \
+ $(wildcard sideplugin/topling-rocks/src/table/*.cc) \
+ sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC}
+else
+ $(warning NotFound sideplugin/topling-rocks, this is ok, only ToplingZipTable is disabled)
+endif
+endif
+
+TOPLING_DCOMPACT_USE_ETCD := 0
+ifneq (,$(wildcard sideplugin/topling-dcompact/3rdparty/etcd-cpp-apiv3/build/src/libetcd-cpp-api.${PLATFORM_SHARED_EXT}))
+ifneq (,$(wildcard sideplugin/topling-dcompact/3rdparty/etcd-cpp-apiv3/build/proto/gen/proto))
+ CXXFLAGS += -I sideplugin/topling-dcompact/3rdparty/etcd-cpp-apiv3/build/proto/gen/proto \
+ -I sideplugin/topling-dcompact/3rdparty/etcd-cpp-apiv3
+ LDFLAGS += -L sideplugin/topling-dcompact/3rdparty/etcd-cpp-apiv3/build/src -letcd-cpp-api
+ export LD_LIBRARY_PATH:=${TOPLING_ROCKS_DIR}/3rdparty/etcd-cpp-apiv3/build/src:${LD_LIBRARY_PATH}
+ ifneq (,$(wildcard ../vcpkg/packages/grpc_x64-linux/include))
+ CXXFLAGS += -I ../vcpkg/packages/grpc_x64-linux/include
+ else
+ $(error NotFound ../vcpkg/packages/grpc_x64-linux/include)
+ endif
+ ifneq (,$(wildcard ../vcpkg/packages/protobuf_x64-linux/include))
+ CXXFLAGS += -I ../vcpkg/packages/protobuf_x64-linux/include
+ else
+ $(error NotFound ../vcpkg/packages/protobuf_x64-linux/include)
+ endif
+ ifneq (,$(wildcard ../vcpkg/packages/cpprestsdk_x64-linux/include))
+ CXXFLAGS += -I ../vcpkg/packages/cpprestsdk_x64-linux/include
+ else
+ $(error NotFound ../vcpkg/packages/cpprestsdk_x64-linux/include)
+ endif
+ CXXFLAGS += -DTOPLING_DCOMPACT_USE_ETCD
+ TOPLING_DCOMPACT_USE_ETCD := 1
+endif
+endif
+
+#ifeq (${TOPLING_DCOMPACT_USE_ETCD},0)
+# $(warning NotFound etcd-cpp-apiv3, this is ok, only etcd is disabled)
+#endif
+
+#export ROCKSDB_KICK_OUT_OPTIONS_FILE=1
+
+# prepend EXTRA_LIB_SOURCES to LIB_SOURCES because the EXTRA_LIB_SOURCES
+# files are slow to compile, so start compiling them first
+LIB_SOURCES := ${EXTRA_LIB_SOURCES} ${LIB_SOURCES}
+
AM_DEFAULT_VERBOSITY ?= 0
AM_V_GEN = $(am__v_GEN_$(V))
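To make the object-directory logic in this hunk easier to follow, here is a hedged walk-through of how the variables could expand on an x86_64 Linux host with gcc 11.3 and BMI2 (matching the CI image above; the exact strings depend on `uname`, the compiler probe and `cpu_has_bmi2.sh`):

```bash
# COMPILER            -> g++-11.3       (printed by tools/configure/compiler.cpp)
# UNAME_MachineSystem -> Linux-x86_64   (`uname -m -s` with spaces/slashes turned into '-')
# WITH_BMI2           -> 1              (from cpu_has_bmi2.sh)
# BUILD_NAME          -> Linux-x86_64-g++-11.3-bmi2-1
# BUILD_ROOT          -> build/Linux-x86_64-g++-11.3-bmi2-1
# DEBUG_LEVEL=0              -> OBJ_DIR = build/Linux-x86_64-g++-11.3-bmi2-1/rls   (BUILD_TYPE_SIG = r)
# DEBUG_LEVEL=2 + unit tests -> OBJ_DIR = build-ut/Linux-x86_64-g++-11.3-bmi2-1/dbg
```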
@@ -240,7 +584,7 @@ am__v_AR_0 = @echo " AR " $@;
am__v_AR_1 =
AM_LINK = $(AM_V_CCLD)$(CXX) -L. $(patsubst lib%.a, -l%, $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^)) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
-AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@
+AM_SHARE = $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$@ -L. $(patsubst lib%.$(PLATFORM_SHARED_EXT), -l%, $^) $(EXTRA_SHARED_LIB_LIB) $(EXEC_LDFLAGS) $(LDFLAGS) -o $@
ROCKSDB_PLUGIN_MKS = $(foreach plugin, $(ROCKSDB_PLUGINS), plugin/$(plugin)/*.mk)
include $(ROCKSDB_PLUGIN_MKS)
@@ -267,6 +611,8 @@ ROCKSDB_PLUGIN_JNI_NATIVE_SOURCES = $(foreach plugin, $(ROCKSDB_PLUGINS), $(fore
ALL_JNI_NATIVE_SOURCES = $(JNI_NATIVE_SOURCES) $(ROCKSDB_PLUGIN_JNI_NATIVE_SOURCES)
ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS = $(foreach plugin, $(ROCKSDB_PLUGINS), -I./plugin/$(plugin))
+ALL_JNI_NATIVE_OBJECTS := $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ALL_JNI_NATIVE_SOURCES))
+
ifneq ($(strip $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES)),)
LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs $(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))
ifneq ($(.SHELLSTATUS),0)
@@ -298,7 +644,7 @@ $(info $(shell $(CXX) --version))
endif
missing_make_config_paths := $(shell \
- grep "\./\S*\|/\S*" -o $(CURDIR)/make_config.mk | \
+ egrep "\.+/\S*|([a-z_]*)/\S*" -o $(CURDIR)/make_config.mk | \
while read path; \
do [ -e $$path ] || echo $$path; \
done | sort | uniq | grep -v "/DOES/NOT/EXIST")
@@ -309,8 +655,10 @@ $(foreach path, $(missing_make_config_paths), \
ifeq ($(PLATFORM), OS_AIX)
# no debug info
else ifneq ($(PLATFORM), IOS)
-CFLAGS += -g
-CXXFLAGS += -g
+# dwarf flags are disabled by default
+DBG_DWARF ?=
+CFLAGS += ${DBG_DWARF} -g3
+CXXFLAGS += ${DBG_DWARF} -g3
else
# no debug info for IOS, that will make our library big
OPT += -DNDEBUG
@@ -337,12 +685,20 @@ ifneq ($(MACHINE), arm64)
# linking with jemalloc (as it won't be arm64-compatible) and remove some other options
# set during platform detection
DISABLE_JEMALLOC=1
-PLATFORM_CCFLAGS := $(filter-out -march=native, $(PLATFORM_CCFLAGS))
-PLATFORM_CXXFLAGS := $(filter-out -march=native, $(PLATFORM_CXXFLAGS))
+PLATFORM_CCFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CCFLAGS))
+PLATFORM_CXXFLAGS := $(filter-out -march=native -DHAVE_SSE42 -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
endif
endif
endif
+ifeq (${WITH_BMI2},1)
+ CPU_ARCH ?= -march=haswell
+endif
+ifdef CPU_ARCH
+ PLATFORM_CCFLAGS := ${CPU_ARCH} $(filter-out -march=native -DHAVE_AVX2, $(PLATFORM_CCFLAGS))
+ PLATFORM_CXXFLAGS := ${CPU_ARCH} $(filter-out -march=native -DHAVE_AVX2, $(PLATFORM_CXXFLAGS))
+endif
+
# ASAN doesn't work well with jemalloc. If we're compiling with ASAN, we should use regular malloc.
ifdef COMPILE_WITH_ASAN
DISABLE_JEMALLOC=1
@@ -550,6 +906,9 @@ ifndef DISABLE_WARNING_AS_ERROR
WARNING_FLAGS += -Werror
endif
+# topling specific WARNING_FLAGS
+WARNING_FLAGS := -Wall -Wno-shadow
+WARNING_FLAGS += -Wno-deprecated-builtins
ifdef LUA_PATH
@@ -582,6 +941,7 @@ ifeq ($(NO_THREEWAY_CRC32C), 1)
endif
CFLAGS += $(C_WARNING_FLAGS) $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CCFLAGS) $(OPT)
+CXXFLAGS += -Isideplugin/rockside/src
CXXFLAGS += $(WARNING_FLAGS) -I. -I./include $(PLATFORM_CXXFLAGS) $(OPT) -Woverloaded-virtual -Wnon-virtual-dtor -Wno-missing-field-initializers
# Allow offsetof to work on non-standard layout types. Some compiler could
@@ -591,10 +951,11 @@ CXXFLAGS += -Wno-invalid-offsetof
LDFLAGS += $(PLATFORM_LDFLAGS)
-LIB_OBJECTS = $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES))
+LIB_OBJECTS := $(patsubst %.cc, $(OBJ_DIR)/%.o, $(LIB_SOURCES))
+LIB_OBJECTS := $(patsubst %.cpp,$(OBJ_DIR)/%.o, $(LIB_OBJECTS))
LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(ROCKSDB_PLUGIN_SOURCES))
-ifeq ($(HAVE_POWER8),1)
LIB_OBJECTS += $(patsubst %.c, $(OBJ_DIR)/%.o, $(LIB_SOURCES_C))
+ifeq ($(HAVE_POWER8),1)
LIB_OBJECTS += $(patsubst %.S, $(OBJ_DIR)/%.o, $(LIB_SOURCES_ASM))
endif
@@ -605,6 +966,9 @@ endif
# range_tree is not compatible with non GNU libc on ppc64
# see https://jira.percona.com/browse/PS-7559
ifneq ($(PPC_LIBC_IS_GNU),0)
+ # topling: this line should be moved above and the LIB_OBJECTS += line below deleted; it is added here to keep the diff minimal
+ # add to LIB_SOURCES to generate *.cc.d dependency rules
+ LIB_SOURCES += ${RANGE_TREE_SOURCES}
LIB_OBJECTS += $(patsubst %.cc, $(OBJ_DIR)/%.o, $(RANGE_TREE_SOURCES))
endif
@@ -634,6 +998,13 @@ PLUGIN_TESTS = $(patsubst %.cc, %, $(notdir $(ROCKSDB_PLUGIN_TESTS)))
TESTS = $(patsubst %.cc, %, $(notdir $(TEST_MAIN_SOURCES)))
TESTS += $(patsubst %.c, %, $(notdir $(TEST_MAIN_SOURCES_C)))
TESTS += $(PLUGIN_TESTS)
+ifeq (${MAKE_UNIT_TEST},1)
+ ifeq (cspp,$(patsubst cspp:%,cspp,${DefaultWBWIFactory}))
+ # CSPP WBWI does not support transactions with ts (timestamp)
+ $(warning "test with CSPP_WBWI, skip write_committed_transaction_ts_test")
+ TESTS := $(filter-out write_committed_transaction_ts_test,${TESTS})
+ endif
+endif
# `make check-headers` to very that each header file includes its own
# dependencies
@@ -749,14 +1120,24 @@ MICROBENCHS = $(patsubst %.cc, %, $(notdir $(MICROBENCH_SOURCES)))
ifeq ($(LIBNAME),)
LIBNAME=librocksdb
# we should only run rocksdb in production with DEBUG_LEVEL 0
-ifneq ($(DEBUG_LEVEL),0)
+ifeq ($(DEBUG_LEVEL),2)
LIBDEBUG=_debug
+ ifeq (${MAKE_UNIT_TEST},1)
+ LIBDEBUG=_debug_ut
+ endif
+endif
+ifeq ($(DEBUG_LEVEL),1)
+ LIBDEBUG=_debug_1
+ ifeq (${MAKE_UNIT_TEST},1)
+ LIBDEBUG=_debug_ut_1
+ endif
endif
endif
STATIC_LIBRARY = ${LIBNAME}$(LIBDEBUG).a
STATIC_TEST_LIBRARY = ${LIBNAME}_test$(LIBDEBUG).a
STATIC_TOOLS_LIBRARY = ${LIBNAME}_tools$(LIBDEBUG).a
STATIC_STRESS_LIBRARY = ${LIBNAME}_stress$(LIBDEBUG).a
+#$(error LIBDEBUG = ${LIBDEBUG} PLATFORM_SHARED_VERSIONED=${PLATFORM_SHARED_VERSIONED})
ALL_STATIC_LIBS = $(STATIC_LIBRARY) $(STATIC_TEST_LIBRARY) $(STATIC_TOOLS_LIBRARY) $(STATIC_STRESS_LIBRARY)
@@ -821,8 +1202,8 @@ default: all
#-----------------------------------------------
ifneq ($(PLATFORM_SHARED_EXT),)
-ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED1 = ${LIBNAME}$(LIBDEBUG).$(PLATFORM_SHARED_EXT)
+ifneq ($(PLATFORM_SHARED_VERSIONED),true)
SHARED2 = $(SHARED1)
SHARED3 = $(SHARED1)
SHARED4 = $(SHARED1)
@@ -831,7 +1212,6 @@ else
SHARED_MAJOR = $(ROCKSDB_MAJOR)
SHARED_MINOR = $(ROCKSDB_MINOR)
SHARED_PATCH = $(ROCKSDB_PATCH)
-SHARED1 = ${LIBNAME}.$(PLATFORM_SHARED_EXT)
ifeq ($(PLATFORM), OS_MACOSX)
SHARED_OSX = $(LIBNAME)$(LIBDEBUG).$(SHARED_MAJOR)
SHARED2 = $(SHARED_OSX).$(PLATFORM_SHARED_EXT)
@@ -852,7 +1232,7 @@ $(SHARED3): $(SHARED4)
endif # PLATFORM_SHARED_VERSIONED
$(SHARED4): $(LIB_OBJECTS)
- $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(LIB_OBJECTS) $(LDFLAGS) -o $@
+ $(AM_V_CCLD) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED3) $(LIB_OBJECTS) $(EXTRA_SHARED_LIB_LIB) $(LDFLAGS) -o $@
endif # PLATFORM_SHARED_EXT
.PHONY: check clean coverage ldb_tests package dbg gen-pc build_size \
@@ -866,7 +1246,11 @@ all_but_some_tests: $(LIBRARY) $(BENCHMARKS) tools tools_lib test_libs $(ROCKSDB
static_lib: $(STATIC_LIBRARY)
+ifdef TOPLING_DCOMPACT_GIT_VER_SRC
+shared_lib: $(SHARED) dcompact_worker
+else
shared_lib: $(SHARED)
+endif
stress_lib: $(STRESS_LIBRARY)
@@ -1226,6 +1610,9 @@ clean-rocks:
rm -f ${LIBNAME}*.so* ${LIBNAME}*.a
rm -f $(BENCHMARKS) $(TOOLS) $(TESTS) $(PARALLEL_TEST) $(MICROBENCHS)
rm -rf $(CLEAN_FILES) ios-x86 ios-arm scan_build_report
+ rm -rf build build-ut
+ rm -rf sideplugin/topling-dcompact/tools/dcompact/build
+ +$(MAKE) -C ${TOPLING_CORE_DIR} clean
$(FIND) . -name "*.[oda]" -exec rm -f {} \;
$(FIND) . -type f \( -name "*.gcda" -o -name "*.gcno" \) -exec rm -f {} \;
@@ -1306,6 +1693,14 @@ librocksdb_env_basic_test.a: $(OBJ_DIR)/env/env_basic_test.o $(LIB_OBJECTS) $(TE
db_bench: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY)
$(AM_LINK)
+ifeq (${DEBUG_LEVEL},2)
+db_bench_dbg: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY)
+ $(AM_LINK)
+endif
+ifeq (${DEBUG_LEVEL},0)
+db_bench_rls: $(OBJ_DIR)/tools/db_bench.o $(BENCH_OBJECTS) $(TESTUTIL) $(LIBRARY)
+ $(AM_LINK)
+endif
trace_analyzer: $(OBJ_DIR)/tools/trace_analyzer.o $(ANALYZE_OBJECTS) $(TOOLS_LIBRARY) $(LIBRARY)
$(AM_LINK)
@@ -1348,7 +1743,7 @@ $(foreach test, $(ROCKSDB_PLUGIN_TESTS), $(eval $(call MakeTestRule, $(test))))
arena_test: $(OBJ_DIR)/memory/arena_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-memory_allocator_test: memory/memory_allocator_test.o $(TEST_LIBRARY) $(LIBRARY)
+memory_allocator_test: $(OBJ_DIR)/memory/memory_allocator_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
autovector_test: $(OBJ_DIR)/util/autovector_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1450,7 +1845,7 @@ db_wide_basic_test: $(OBJ_DIR)/db/wide/db_wide_basic_test.o $(TEST_LIBRARY) $(LI
db_with_timestamp_basic_test: $(OBJ_DIR)/db/db_with_timestamp_basic_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-db_with_timestamp_compaction_test: db/db_with_timestamp_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
+db_with_timestamp_compaction_test: $(OBJ_DIR)/db/db_with_timestamp_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_encryption_test: $(OBJ_DIR)/db/db_encryption_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1666,7 +2061,7 @@ random_access_file_reader_test: $(OBJ_DIR)/file/random_access_file_reader_test.o
file_reader_writer_test: $(OBJ_DIR)/util/file_reader_writer_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-block_based_table_reader_test: table/block_based/block_based_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
+block_based_table_reader_test: $(OBJ_DIR)/table/block_based/block_based_table_reader_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
full_filter_block_test: $(OBJ_DIR)/table/block_based/full_filter_block_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1684,7 +2079,7 @@ cleanable_test: $(OBJ_DIR)/table/cleanable_test.o $(TEST_LIBRARY) $(LIBRARY)
table_test: $(OBJ_DIR)/table/table_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-block_fetcher_test: table/block_fetcher_test.o $(TEST_LIBRARY) $(LIBRARY)
+block_fetcher_test: $(OBJ_DIR)/table/block_fetcher_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
block_test: $(OBJ_DIR)/table/block_based/block_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1774,10 +2169,10 @@ thread_list_test: $(OBJ_DIR)/util/thread_list_test.o $(TEST_LIBRARY) $(LIBRARY)
compact_files_test: $(OBJ_DIR)/db/compact_files_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-configurable_test: options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY)
+configurable_test: $(OBJ_DIR)/options/configurable_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-customizable_test: options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY)
+customizable_test: $(OBJ_DIR)/options/customizable_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
options_test: $(OBJ_DIR)/options/options_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1831,7 +2226,7 @@ write_callback_test: $(OBJ_DIR)/db/write_callback_test.o $(TEST_LIBRARY) $(LIBRA
heap_test: $(OBJ_DIR)/util/heap_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-point_lock_manager_test: utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
+point_lock_manager_test: $(OBJ_DIR)/utilities/transactions/lock/point/point_lock_manager_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
transaction_test: $(OBJ_DIR)/utilities/transactions/transaction_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1900,7 +2295,7 @@ blob_db_test: $(OBJ_DIR)/utilities/blob_db/blob_db_test.o $(TEST_LIBRARY) $(LIBR
repeatable_thread_test: $(OBJ_DIR)/util/repeatable_thread_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
-range_locking_test: utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY)
+range_locking_test: $(OBJ_DIR)/utilities/transactions/lock/range/range_locking_test.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
range_tombstone_fragmenter_test: $(OBJ_DIR)/db/range_tombstone_fragmenter_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -1964,6 +2359,57 @@ io_tracer_parser_test: $(OBJ_DIR)/tools/io_tracer_parser_test.o $(OBJ_DIR)/tools
$(AM_LINK)
io_tracer_parser: $(OBJ_DIR)/tools/io_tracer_parser.o $(TOOLS_LIBRARY) $(LIBRARY)
+#--------------------------------------------------
+ifndef ROCKSDB_USE_LIBRADOS
+ AUTO_ALL_EXCLUDE_SRC += utilities/env_librados_test.cc
+ AUTO_ALL_EXCLUDE_SRC += utilities/env_mirror_test.cc
+endif
+
+AUTO_ALL_TESTS_SRC := $(shell find * -name '*_test.cc' -not -path 'java/*' -not -path '*/3rdparty/*') ${EXTRA_TESTS_SRC}
+AUTO_ALL_TESTS_SRC := $(filter-out ${AUTO_ALL_EXCLUDE_SRC},${AUTO_ALL_TESTS_SRC})
+AUTO_ALL_TESTS_OBJ := $(addprefix $(OBJ_DIR)/,$(AUTO_ALL_TESTS_SRC:%.cc=%.o))
+AUTO_ALL_TESTS_EXE := $(AUTO_ALL_TESTS_OBJ:%.o=%)
+
+define LN_TEST_TARGET
+t${DEBUG_LEVEL}/${1}: ${2}
+ mkdir -p $(dir $$@) && ln -sf `realpath ${2}` $$@
+
+endef
+# the blank line above is intentional
+
+.PHONY: auto_all_tests
+auto_all_tests: ${AUTO_ALL_TESTS_EXE}
+
+$(OBJ_DIR)/tools/%_test: $(OBJ_DIR)/tools/%_test.o \
+ ${TOOLS_LIBRARY} $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(OBJ_DIR)/%_test: $(OBJ_DIR)/%_test.o $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(eval $(foreach test,${AUTO_ALL_TESTS_EXE},$(call LN_TEST_TARGET,$(notdir ${test}),${test})))
+
+$(OBJ_DIR)/tools/db_bench_tool_test : \
+$(OBJ_DIR)/tools/db_bench_tool_test.o \
+ ${BENCH_OBJECTS} $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(OBJ_DIR)/file/prefetch_test : \
+$(OBJ_DIR)/file/prefetch_test.o \
+$(OBJ_DIR)/tools/io_tracer_parser_tool.o $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(OBJ_DIR)/tools/trace_analyzer_test : \
+$(OBJ_DIR)/tools/trace_analyzer_test.o \
+ ${ANALYZE_OBJECTS} ${TOOLS_LIBRARY} $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_test : \
+$(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer_test.o \
+$(OBJ_DIR)/tools/block_cache_analyzer/block_cache_trace_analyzer.o $(TEST_LIBRARY) $(LIBRARY)
+ $(AM_LINK)
+
+$(OBJ_DIR)/%: $(OBJ_DIR)/%.o $(TEST_LIBRARY) $(LIBRARY)
$(AM_LINK)
db_blob_corruption_test: $(OBJ_DIR)/db/blob/db_blob_corruption_test.o $(TEST_LIBRARY) $(LIBRARY)
@@ -2018,22 +2464,48 @@ install-headers: gen-pc
install -d $(DESTDIR)/$(PREFIX)/include/rocksdb/`dirname $$header`; \
install -C -m 644 $$header $(DESTDIR)/$(PREFIX)/include/rocksdb/$$header; \
done
+ install -d $(DESTDIR)/$(PREFIX)/include/topling
+ install -C -m 644 sideplugin/rockside/src/topling/json.h $(DESTDIR)/$(PREFIX)/include/topling
+ install -C -m 644 sideplugin/rockside/src/topling/json_fwd.h $(DESTDIR)/$(PREFIX)/include/topling
+ install -C -m 644 sideplugin/rockside/src/topling/builtin_table_factory.h $(DESTDIR)/$(PREFIX)/include/topling
+ install -C -m 644 sideplugin/rockside/src/topling/side_plugin_repo.h $(DESTDIR)/$(PREFIX)/include/topling
+ install -C -m 644 sideplugin/rockside/src/topling/side_plugin_factory.h $(DESTDIR)/$(PREFIX)/include/topling
+ install -d $(DESTDIR)/$(PREFIX)/include/terark
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/io
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/succinct
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/thread
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/util
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/fsa
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/fsa/ppi
+ install -d $(DESTDIR)/$(PREFIX)/include/terark/zbs
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/*.hpp $(DESTDIR)/$(PREFIX)/include/terark
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/io/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/io
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/succinct/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/succinct
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/thread/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/thread
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/util/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/util
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/fsa
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/*.inl $(DESTDIR)/$(PREFIX)/include/terark/fsa
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/fsa/ppi/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/fsa/ppi
+ install -C -m 644 ${TOPLING_CORE_DIR}/src/terark/zbs/*.hpp $(DESTDIR)/$(PREFIX)/include/terark/zbs
+ cp -ar ${TOPLING_CORE_DIR}/boost-include/boost $(DESTDIR)/$(PREFIX)/include
install -C -m 644 rocksdb.pc $(INSTALL_LIBDIR)/pkgconfig/rocksdb.pc
-install-static: install-headers $(LIBRARY)
+install-static: install-headers $(LIBRARY) static_lib
install -d $(INSTALL_LIBDIR)
install -C -m 755 $(LIBRARY) $(INSTALL_LIBDIR)
+ cp -a ${TOPLING_CORE_DIR}/${BUILD_ROOT}/lib_static/* $(INSTALL_LIBDIR)
-install-shared: install-headers $(SHARED4)
+install-shared: install-headers $(SHARED4) shared_lib
install -d $(INSTALL_LIBDIR)
install -C -m 755 $(SHARED4) $(INSTALL_LIBDIR)
ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED3)
ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED2)
ln -fs $(SHARED4) $(INSTALL_LIBDIR)/$(SHARED1)
+ cp -a ${TOPLING_CORE_DIR}/${BUILD_ROOT}/lib_shared/* $(INSTALL_LIBDIR)
+ mkdir -p $(DESTDIR)$(PREFIX)/bin
+ cp -a sideplugin/topling-dcompact/tools/dcompact/${OBJ_DIR}/*.exe $(DESTDIR)$(PREFIX)/bin
-# install static by default + install shared if it exists
-install: install-static
- [ -e $(SHARED4) ] && $(MAKE) install-shared || :
+install: install-${LIB_MODE}
# Generate the pkg-config file
gen-pc:
@@ -2046,7 +2518,7 @@ gen-pc:
-echo 'Description: An embeddable persistent key-value store for fast storage' >> rocksdb.pc
-echo Version: $(shell ./build_tools/version.sh full) >> rocksdb.pc
-echo 'Libs: -L$${libdir} $(EXEC_LDFLAGS) -lrocksdb' >> rocksdb.pc
- -echo 'Libs.private: $(PLATFORM_LDFLAGS)' >> rocksdb.pc
+ -echo 'Libs.private: -lterark-zbs-r -lterark-fsa-r -lterark-core-r $(PLATFORM_LDFLAGS)' >> rocksdb.pc
-echo 'Cflags: -I$${includedir} $(PLATFORM_CXXFLAGS)' >> rocksdb.pc
-echo 'Requires: $(subst ",,$(ROCKSDB_PLUGIN_PKGCONFIG_REQUIRES))' >> rocksdb.pc
@@ -2399,18 +2871,31 @@ rocksdbjavastaticnexusbundlejar: rocksdbjavageneratepom
jl/%.o: %.cc
$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fPIC -c $< -o $@ $(COVERAGEFLAGS)
-rocksdbjava: $(LIB_OBJECTS)
+${ALL_JNI_NATIVE_OBJECTS}: CXXFLAGS += -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS)
+${ALL_JNI_NATIVE_OBJECTS}: rocksdbjava-header
+rocksdbjava: $(LIB_OBJECTS) $(ALL_JNI_NATIVE_OBJECTS)
ifeq ($(JAVA_HOME),)
$(error JAVA_HOME is not set)
endif
- $(AM_V_GEN)cd java; $(MAKE) javalib;
$(AM_V_at)rm -f ./java/target/$(ROCKSDBJNILIB)
- $(AM_V_at)$(CXX) $(CXXFLAGS) -I./java/. -I./java/rocksjni $(JAVA_INCLUDE) $(ROCKSDB_PLUGIN_JNI_CXX_INCLUDEFLAGS) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_SOURCES) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(COVERAGEFLAGS)
+ $(AM_V_at)$(CXX) $(CXXFLAGS) -shared -fPIC -o ./java/target/$(ROCKSDBJNILIB) $(ALL_JNI_NATIVE_OBJECTS) $(LIB_OBJECTS) $(JAVA_LDFLAGS) $(LDFLAGS)
+ $(AM_V_at)cp -a ${TOPLING_CORE_DIR}/${BUILD_ROOT}/lib_shared/*${COMPILER}*-r.so java/target
+ $(AM_V_at)cp -a sideplugin/rockside/src/topling/web/{style.css,index.html} java/target
+ifeq ($(STRIP_DEBUG_INFO),1)
+ $(AM_V_at)strip java/target/*.so
+endif
$(AM_V_at)cd java; $(JAR_CMD) -cf target/$(ROCKSDB_JAR) HISTORY*.md
- $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) $(ROCKSDBJNILIB)
+ $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) *.so
+ $(AM_V_at)cd java/target; $(JAR_CMD) -uf $(ROCKSDB_JAR) style.css index.html
$(AM_V_at)cd java/target/classes; $(JAR_CMD) -uf ../$(ROCKSDB_JAR) org/rocksdb/*.class org/rocksdb/util/*.class
$(AM_V_at)openssl sha1 java/target/$(ROCKSDB_JAR) | sed 's/.*= \([0-9a-f]*\)/\1/' > java/target/$(ROCKSDB_JAR).sha1
+rocksdbjava-header:
+ifeq ($(JAVA_HOME),)
+ $(error JAVA_HOME is not set)
+endif
+ $(AM_V_GEN)cd java; $(MAKE) javalib;
+
jclean:
cd java;$(MAKE) clean;
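With the `rocksdbjava-header` split above, the JNI sources are compiled as regular objects in parallel with the library; a hedged usage sketch following the CI workflow (the 7.10.0 jar name comes from that workflow):

```bash
# Build the JNI shared library and jar (JAVA_HOME must be set).
make rocksdbjava -j"$(nproc)" DEBUG_LEVEL=0
# The jar, the bundled libterark-*.so and the web assets land under java/target/,
# e.g. java/target/rocksdbjni-7.10.0-linux64.jar
ls -l java/target/rocksdbjni-*-linux64.jar java/target/*.so
```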
@@ -2531,7 +3016,16 @@ $(OBJ_DIR)/%.o: %.cpp
$(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -c $< -o $@ $(COVERAGEFLAGS)
$(OBJ_DIR)/%.o: %.c
- $(AM_V_CC)$(CC) $(CFLAGS) -c $< -o $@
+ $(AM_V_CC)mkdir -p $(@D) && $(CC) $(CFLAGS) -c $< -o $@
+
+$(OBJ_DIR)/%.s: %.cc
+ $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -Wa,-adhln -fverbose-asm -masm=intel -S $< -o $@ $(COVERAGEFLAGS)
+
+$(OBJ_DIR)/%.s: %.cpp
+ $(AM_V_CC)mkdir -p $(@D) && $(CXX) $(CXXFLAGS) -fverbose-asm -masm=intel -S $< -o $@ $(COVERAGEFLAGS)
+
+$(OBJ_DIR)/%.s: %.c
+ $(AM_V_CC)mkdir -p $(@D) && $(CC) $(CFLAGS) -fverbose-asm -masm=intel -S $< -o $@
endif
# ---------------------------------------------------------------------------
@@ -2539,8 +3033,9 @@ endif
# ---------------------------------------------------------------------------
# If skip dependencies is ON, skip including the dep files
ifneq ($(SKIP_DEPENDS), 1)
-DEPFILES = $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES))
-DEPFILES+ = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C))
+DEPFILES := $(patsubst %.cc, $(OBJ_DIR)/%.cc.d, $(ALL_SOURCES))
+DEPFILES := $(patsubst %.cpp,$(OBJ_DIR)/%.cpp.d,$(DEPFILES))
+DEPFILES += $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C) $(TEST_MAIN_SOURCES_C))
ifeq ($(USE_FOLLY_LITE),1)
DEPFILES +=$(patsubst %.cpp, $(OBJ_DIR)/%.cpp.d, $(FOLLY_SOURCES))
endif
@@ -2554,12 +3049,12 @@ endif
$(OBJ_DIR)/%.cc.d: %.cc
@mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
-MM -MT'$@' -MT'$(<:.cc=.o)' -MT'$(<:%.cc=$(OBJ_DIR)/%.o)' \
- "$<" -o '$@'
+ "$<" -o '$@'
$(OBJ_DIR)/%.cpp.d: %.cpp
@mkdir -p $(@D) && $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) \
-MM -MT'$@' -MT'$(<:.cpp=.o)' -MT'$(<:%.cpp=$(OBJ_DIR)/%.o)' \
- "$<" -o '$@'
+ "$<" -o '$@'
ifeq ($(HAVE_POWER8),1)
DEPFILES_C = $(patsubst %.c, $(OBJ_DIR)/%.c.d, $(LIB_SOURCES_C))
@@ -2587,6 +3082,70 @@ build_subset_tests: $(ROCKSDBTESTS_SUBSET)
list_all_tests:
echo "$(ROCKSDBTESTS_SUBSET)"
+TOPLING_ZBS_TARGET := ${BUILD_ROOT}/lib_shared/libterark-zbs-${COMPILER}-${BUILD_TYPE_SIG}.${PLATFORM_SHARED_EXT}
+${SHARED4}: ${TOPLING_CORE_DIR}/${TOPLING_ZBS_TARGET}
+${TOPLING_CORE_DIR}/${TOPLING_ZBS_TARGET}: CXXFLAGS =
+${TOPLING_CORE_DIR}/${TOPLING_ZBS_TARGET}: LDFLAGS =
+${TOPLING_CORE_DIR}/${TOPLING_ZBS_TARGET}:
+ +make -C ${TOPLING_CORE_DIR} ${TOPLING_ZBS_TARGET}
+
+${STATIC_LIBRARY}: ${BUILD_ROOT}/lib_shared/libterark-zbs-${COMPILER}-${BUILD_TYPE_SIG}.a
+${BUILD_ROOT}/lib_shared/libterark-zbs-${COMPILER}-${BUILD_TYPE_SIG}.a:
+ +make -C ${TOPLING_CORE_DIR} core fsa zbs
+
+ifeq (${WITH_TOPLING_ROCKS},1)
+ifneq (,$(wildcard sideplugin/topling-rocks))
+sideplugin/topling-rocks/${TOPLING_ROCKS_GIT_VER_SRC}: \
+ $(shell find sideplugin/topling-rocks/{src,tools} -name '*.cc' -o -name '*.h')
+ +make -C sideplugin/topling-rocks ${TOPLING_ROCKS_GIT_VER_SRC}
+endif
+endif
+
+ifneq (,$(wildcard sideplugin/cspp-memtable))
+sideplugin/cspp-memtable/${CSPP_MEMTABLE_GIT_VER_SRC}: \
+ sideplugin/cspp-memtable/cspp_memtable.cc \
+ sideplugin/cspp-memtable/Makefile
+ +make -C sideplugin/cspp-memtable ${CSPP_MEMTABLE_GIT_VER_SRC}
+endif
+ifneq (,$(wildcard sideplugin/cspp-wbwi))
+sideplugin/cspp-wbwi/${CSPP_WBWI_GIT_VER_SRC}: \
+ sideplugin/cspp-wbwi/cspp_wbwi.cc \
+ sideplugin/cspp-wbwi/Makefile
+ +make -C sideplugin/cspp-wbwi ${CSPP_WBWI_GIT_VER_SRC}
+endif
+ifneq (,$(wildcard sideplugin/topling-sst/src/table))
+sideplugin/topling-sst/${TOPLING_SST_GIT_VER_SRC}: \
+ $(wildcard sideplugin/topling-sst/src/table/*.h) \
+ $(wildcard sideplugin/topling-sst/src/table/*.cc) \
+ sideplugin/topling-sst/Makefile
+ +make -C sideplugin/topling-sst ${TOPLING_SST_GIT_VER_SRC}
+endif
+ifneq (,$(wildcard sideplugin/topling-zip_table_reader/src/table))
+sideplugin/topling-zip_table_reader/${TOPLING_ZIP_TABLE_READER_GIT_VER_SRC}: \
+ $(wildcard sideplugin/topling-zip_table_reader/src/table/*.h) \
+ $(wildcard sideplugin/topling-zip_table_reader/src/table/*.cc) \
+ sideplugin/topling-zip_table_reader/Makefile
+ +make -C sideplugin/topling-zip_table_reader ${TOPLING_ZIP_TABLE_READER_GIT_VER_SRC}
+endif
+ifneq (,$(wildcard sideplugin/topling-dcompact/src/dcompact))
+sideplugin/topling-dcompact/${TOPLING_DCOMPACT_GIT_VER_SRC}: \
+ $(wildcard sideplugin/topling-dcompact/src/dcompact/*.h) \
+ $(wildcard sideplugin/topling-dcompact/src/dcompact/*.cc) \
+ $(wildcard sideplugin/topling-dcompact/tools/dcompact/*.cpp) \
+ sideplugin/topling-dcompact/Makefile
+ +make -C sideplugin/topling-dcompact ${TOPLING_DCOMPACT_GIT_VER_SRC}
+.PHONY: dcompact_worker
+dcompact_worker: ${SHARED1}
+ifeq (${MAKE_UNIT_TEST},1)
+ @echo rocksdb unit test, skip dcompact_worker
+else
+ +make -C sideplugin/topling-dcompact/tools/dcompact ${OBJ_DIR}/dcompact_worker.exe CHECK_TERARK_FSA_LIB_UPDATE=0
+ cp -a sideplugin/topling-dcompact/tools/dcompact/${OBJ_DIR}/dcompact_worker.exe ${OBJ_DIR}
+endif
+endif
+
+${OBJ_DIR}/sideplugin/rockside/src/topling/web/civetweb.o: CFLAGS += -DUSE_ZLIB
+
# Remove the rules for which dependencies should not be generated and see if any are left.
#If so, include the dependencies; if not, do not include the dependency files
ROCKS_DEP_RULES=$(filter-out clean format check-format check-buck-targets check-headers check-sources jclean jtest package analyze tags rocksdbjavastatic% unity.% unity_test checkout_folly, $(MAKECMDGOALS))
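Putting the Makefile knobs from this diff together, an end-to-end community build could look like the sketch below (a hedged example: `WITH_TOPLING_ROCKS=0` and `UPDATE_REPO=0` are optional overrides that skip the private-repo clone and the automatic `git pull`):

```bash
# Community build of the shared library (plus dcompact_worker when topling-dcompact is present).
make -j"$(nproc)" DEBUG_LEVEL=0 UPDATE_REPO=0 WITH_TOPLING_ROCKS=0 shared_lib

# Install headers, librocksdb and the libterark-* runtime libraries under /opt.
sudo make install-shared PREFIX=/opt
export LD_LIBRARY_PATH=/opt/lib:$LD_LIBRARY_PATH
```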
diff --git a/README-zh_cn.md b/README-zh_cn.md
new file mode 100644
index 0000000000..c6d77a0f85
--- /dev/null
+++ b/README-zh_cn.md
@@ -0,0 +1,136 @@
+## ToplingDB: A Persistent Key-Value Store for External Storage
+ToplingDB is developed and maintained by [Topling Inc.](https://topling.cn) (北京拓扑岭科技有限公司). It is forked from [RocksDB](https://github.com/facebook/rocksdb); see the [ToplingDB Branch Name Convention](https://github.com/topling/toplingdb/wiki/ToplingDB-Branch-Name-Convention) for details.
+
+ToplingDB's submodule **[rockside](https://github.com/topling/rockside)** is the entry point of ToplingDB; see the **[SidePlugin wiki](https://github.com/topling/rockside/wiki)** for details.
+
+While staying compatible with the RocksDB API, ToplingDB adds many important features and improvements:
+1. [SidePlugin](https://github.com/topling/rockside/wiki) lets users define DB configurations in json/yaml files
+1. [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView) lets users view almost all DB info on the web; this is a sub-feature of [SidePlugin](https://github.com/topling/rockside/wiki)
+1. [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView) lets users [change](https://github.com/topling/rockside/wiki/Online-Change-Options) all kinds of db/cf options online, including DB meta objects (such as MemTabFactory, TableFactory, WriteBufferManager ...), without restarting the process
+1. Many refactorings and improvements for performance and extensibility, e.g. the MemTable refactoring
+1. Improved transaction processing, especially lock management in TransactionDB, with more than 5x speedup in hot code paths
+1. MultiGet with concurrent IO by fiber/coroutine + io_uring, faster and simpler than RocksDB's own async MultiGet, with more than 100x less code
+1. [De-virtualization](https://github.com/topling/rockside/wiki/Devirtualization-And-Key-Prefix-Cache-Principle): eliminates virtual function calls in hot code paths (mainly the Comparator) and adds key prefix caches, see the corresponding [benchmarks](https://github.com/topling/rockside/wiki/Devirtualization-And-Key-Prefix-Cache-Benchmark)
+1. Zero copy for point lookups and iterator scans, especially effective for large values
+1. Existing RocksDB components are integrated into SidePlugin as **builtin plugins**, e.g. Cache, Comparator, TableFactory, MemTableFactory...
+1. Builtin Prometheus metrics support, implemented in the [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView)
+1. Fixed many RocksDB bugs; for the fixes and improvements that are easy to merge back we have sent [Pull Requests](https://github.com/facebook/rocksdb/pulls?q=is%3Apr+author%3Arockeet) to upstream RocksDB
+
+## ToplingDB Cloud Native DB Services
+1. [MyTopling](https://github.com/topling/mytopling) (MySQL on ToplingDB), [managed MyTopling on Aliyun](https://topling.cn/products/mytopling/)
+1. [Todis](https://github.com/topling/todis) (Redis on ToplingDB), [managed Todis on Aliyun](https://topling.cn/products/todis-enterprise/)
+
+## ToplingDB Components
+With the SidePlugin mechanism, plugins (components) are physically separated from the ToplingDB core code:
+1. They can be compiled into separate dynamic libraries and loaded at runtime
+1. Application code needs no change for a plugin; only the json/yaml config needs to change
+
+### Repository directory structure
+```bash
+toplingdb
+ \__ sideplugin
+ \__ rockside (submodule , sideplugin core and framework)
+ \__ topling-zip (auto clone, zip and core lib)
+ \__ cspp-memtab (auto clone, sideplugin component)
+ \__ cspp-wbwi (auto clone, sideplugin component)
+ \__ topling-sst (auto clone, sideplugin component)
+ \__ topling-rocks (auto clone, sideplugin component)
+ \__ topling-zip_table_reader (auto clone, sideplugin component)
+ \__ topling-dcompact (auto clone, sideplugin component)
+ \_ tools/dcompact (dcompact-worker binary app)
+```
+ repo | permission | description
+-------------- | ---------- | -----------
+[ToplingDB](https://github.com/topling/toplingdb) | public | top repository, forked from [RocksDB](https://github.com/facebook/rocksdb) with our improvements and fixes
+[rockside](https://github.com/topling/rockside) | public | submodule of ToplingDB, contains:<br/>- the SidePlugin framework and builtin plugins<br/>- the embedded Http server and Prometheus metrics
+[cspp-wbwi<br/>(**W**rite**B**atch**W**ith**I**ndex)](https://github.com/topling/cspp-wbwi) | public | **CSPP_WBWI**, implemented with the Topling CSPP Trie, is up to 20x+ faster than rocksdb's SkipList WBWI
+[cspp-memtable](https://github.com/topling/cspp-memtable) | public | (**CSPP** is **C**rash **S**afe **P**arallel **P**atricia trie) MemTab; compared with SkipList: lower memory usage, 7x single-thread speedup, linear multi-thread scaling, and it can be [converted to an SST directly](https://github.com/topling/cspp-memtable#%E4%BA%8Cmemtable-%E7%9B%B4%E6%8E%A5%E8%BD%AC%E5%8C%96%E6%88%90-sst)
+[topling-sst](https://github.com/topling/topling-sst) | public | 1. [SingleFastTable](https://github.com/topling/rockside/wiki/SingleFastTable) (mainly for L0 and L1)<br/>2. VecAutoSortTable (mainly for MyTopling bulk_load)<br/>3. deprecated: [ToplingFastTable](https://github.com/topling/rockside/wiki/ToplingFastTable), CSPPAutoSortTable
+[topling-dcompact](https://github.com/topling/topling-dcompact) | public | distributed compaction and the generic dcompact_worker program, offloading compactions to an elastic compute cluster.<br/>Compared with RocksDB's own Remote Compaction, ToplingDB's distributed compaction is feature complete, easy to use, and friendly to upper-layer applications
+[topling-rocks](https://github.com/topling/topling-rocks) | **private** | builds [Topling**Zip**Table](https://github.com/topling/rockside/wiki/ToplingZipTable), an SST based on Topling's searchable in-memory compression, with higher compression ratio and lower memory usage, usually used for L2 and deeper SSTs
+[topling-zip_table_reader](https://github.com/topling/topling-zip_table_reader) | public | lets community users read Topling**Zip**Table; creating it requires the private repo [topling-rocks](https://github.com/topling/topling-rocks)
+
+To simplify the build, the ToplingDB Makefile automatically clones the github repos of these components. Community users can clone the public repos successfully, but cloning the private repos (e.g. topling-rocks) will fail, so ToplingDB built by community users cannot create Topling**Zip**Table, but it can still read Topling**Zip**Table.
+
+## Run db_bench
+ToplingDB requires C++17; gcc 8.3 or newer is recommended, and clang also works.
+
+Even without Topling**Zip**Table, ToplingDB is much faster than RocksDB; you can verify the performance by running db_bench:
+```bash
+sudo yum -y install git libaio-devel gcc-c++ gflags-devel zlib-devel bzip2-devel libcurl-devel liburing-devel
+#sudo apt-get update -y && sudo apt-get install -y libjemalloc-dev libaio-dev libgflags-dev zlib1g-dev libbz2-dev libcurl4-gnutls-dev liburing-dev libsnappy-dev libbz2-dev liblz4-dev libzstd-dev
+git clone https://github.com/topling/toplingdb
+cd toplingdb
+make -j`nproc` db_bench DEBUG_LEVEL=0
+cp sideplugin/rockside/src/topling/web/{style.css,index.html} ${/path/to/dbdir}
+cp sideplugin/rockside/sample-conf/db_bench_*.yaml .
+export LD_LIBRARY_PATH=`find sideplugin -name lib_shared`
+# change db_bench_community.yaml to your needs
+# 1. use the default path (/dev/shm) if you have no fast disk (such as on a cloud server)
+# 2. change max_background_compactions to your cpu core num
+# 3. if you have permission to the github repo topling-rocks, you can use db_bench_enterprise.yaml
+# 4. using db_bench_community.yaml is faster than upstream RocksDB
+# 5. using db_bench_enterprise.yaml is much faster than db_bench_community.yaml
+# the -json command option accepts both json and yaml files; a yaml file is used here for readability
+./db_bench -json=db_bench_community.yaml -num=10000000 -disable_wal=true -value_size=20 -benchmarks=fillrandom,readrandom -batch_size=10
+# you can access http://127.0.0.1:2011 to see webview
+# you can see this db_bench is much faster than RocksDB
+```
+## Configurable features
+For performance and simplicity, ToplingDB disables some RocksDB features by default:
+
+Feature | Control (preprocessor macro)
+-------|-------------
+Dynamic creation of ColumnFamily | ROCKSDB_DYNAMIC_CREATE_CF
+User-level timestamp | TOPLINGDB_WITH_TIMESTAMP
+Wide columns | TOPLINGDB_WITH_WIDE_COLUMNS
+
+**Note**: SidePlugin does not yet support dynamically created ColumnFamily; when SidePlugin and dynamic ColumnFamily creation are mixed, the dynamically created ColumnFamilies cannot be shown in the web view
+
+To enable these features, explicitly add `EXTRA_CXXFLAGS="-D${MACRO_1} -D${MACRO_2} ..."` to the make command, e.g. to build rocksdbjava with dynamic ColumnFamily creation:
+```
+make -j`nproc` EXTRA_CXXFLAGS='-DROCKSDB_DYNAMIC_CREATE_CF' rocksdbjava
+```
+## License
+For open source license compliance, the clauses below that used to prohibit ByteDance from using this software were removed as of 2023-04-24; that is, ByteDance's use of ToplingDB is no longer illegal, nor shameless.
+
+~~We prohibit ByteDance from using this software; all other terms are identical to upstream RocksDB.~~ See [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING), [LICENSE.leveldb](LICENSE.leveldb) for details.
+
+The clauses prohibiting ByteDance from using this software have also been removed from the corresponding LICENSE files: [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING), [LICENSE.leveldb](LICENSE.leveldb).
+
+
+Below is the original README of upstream RocksDB.
+
+
+
+## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
+
+[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
+[![Appveyor Build status](https://ci.appveyor.com/api/projects/status/fbgfu0so3afcno78/branch/main?svg=true)](https://ci.appveyor.com/project/Facebook/rocksdb/branch/main)
+[![PPC64le Build Status](http://140-211-168-68-openstack.osuosl.org:8080/buildStatus/icon?job=rocksdb&style=plastic)](http://140-211-168-68-openstack.osuosl.org:8080/job/rocksdb)
+
+RocksDB is developed and maintained by Facebook Database Engineering Team.
+It is built on earlier work on [LevelDB](https://github.com/google/leveldb) by Sanjay Ghemawat (sanjay@google.com)
+and Jeff Dean (jeff@google.com)
+
+This code is a library that forms the core building block for a fast
+key-value server, especially suited for storing data on flash drives.
+It has a Log-Structured-Merge-Database (LSM) design with flexible tradeoffs
+between Write-Amplification-Factor (WAF), Read-Amplification-Factor (RAF)
+and Space-Amplification-Factor (SAF). It has multi-threaded compactions,
+making it especially suitable for storing multiple terabytes of data in a
+single database.
+
+Start with example usage here: https://github.com/facebook/rocksdb/tree/main/examples
+
+See the [github wiki](https://github.com/facebook/rocksdb/wiki) for more explanation.
+
+The public interface is in `include/`. Callers should not include or
+rely on the details of any other header files in this package. Those
+internal APIs may be changed without warning.
+
+Questions and discussions are welcome on the [RocksDB Developers Public](https://www.facebook.com/groups/rocksdb.dev/) Facebook group and [email list](https://groups.google.com/g/rocksdb) on Google Groups.
+
+## License
+
+RocksDB is dual-licensed under both the GPLv2 (found in the COPYING file in the root directory) and Apache 2.0 License (found in the LICENSE.Apache file in the root directory). You may select, at your option, one of the above-listed licenses.
diff --git a/README.md b/README.md
index 8fcc4abc2c..d68c65285e 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,114 @@
+## [中文版](README-zh_cn.md)
+## ToplingDB: A Persistent Key-Value Store for External Storage
+ToplingDB is developed and maintained by [Topling Inc](https://topling.cn). It is built with [RocksDB](https://github.com/facebook/rocksdb). See [ToplingDB Branch Name Convention](https://github.com/topling/toplingdb/wiki/ToplingDB-Branch-Name-Convention).
+
+ToplingDB's submodule **[rockside](https://github.com/topling/rockside)** is the entry point of ToplingDB, see **[SidePlugin wiki](https://github.com/topling/rockside/wiki)**.
+
+ToplingDB has many more key features than RocksDB:
+1. [SidePlugin](https://github.com/topling/rockside/wiki) enables users to write a json (or yaml) file to define DB configs; a usage sketch follows this list
+1. [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView) enables users to view almost all DB info on the web, this is a component of [SidePlugin](https://github.com/topling/rockside/wiki)
+1. [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView) enables users to [online change](https://github.com/topling/rockside/wiki/Online-Change-Options) db/cf options and all db meta objects (such as MemTabFactory, TableFactory, WriteBufferManager ...) without restarting the running process
+1. Many improvements and refactorings of RocksDB, aimed at performance and extensibility
+1. Topling transaction lock management, 5x faster than RocksDB's
+1. MultiGet with concurrent IO by fiber/coroutine + io_uring, much faster than RocksDB's async MultiGet
+1. Topling [de-virtualization](https://github.com/topling/rockside/wiki/Devirtualization-And-Key-Prefix-Cache-Principle): de-virtualize hotspot (virtual) functions and add key prefix caches, see the [benchmarks](https://github.com/topling/rockside/wiki/Devirtualization-And-Key-Prefix-Cache-Benchmark)
+1. Topling zero copy for point search (Get/MultiGet) and Iterator
+1. Builtin SidePlugin**s** for existing RocksDB components (Cache, Comparator, TableFactory, MemTableFactory...)
+1. Builtin Prometheus metrics support, based on the [Embedded Http Server](https://github.com/topling/rockside/wiki/WebView)
+1. Many bugfixes for RocksDB; some of these fixes were [Pull Requested](https://github.com/facebook/rocksdb/pulls?q=is%3Apr+author%3Arockeet) to [upstream RocksDB](https://github.com/facebook/rocksdb)
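+
+A minimal sketch of how a SidePlugin-configured DB is opened from user code, assuming the `SidePluginRepo` API described in the [SidePlugin wiki](https://github.com/topling/rockside/wiki) (`ImportAutoFile`, `OpenDB`, `StartHttpServer`, `CloseAllDB`); the header path and the yaml file name are placeholders, not taken from this patch:
+```cpp
+#include <cstdio>
+#include <rocksdb/db.h>
+#include <topling/side_plugin_repo.h> // header path is an assumption; provided by the rockside submodule
+
+int main() {
+  using namespace rocksdb;
+  SidePluginRepo repo;
+  // all DB/CF options, TableFactory, Cache ... are defined in the yaml file
+  if (!repo.ImportAutoFile("mydb.yaml").ok()) return 1; // "mydb.yaml" is a placeholder
+  DB* db = nullptr;
+  if (!repo.OpenDB(&db).ok()) return 2;   // open the DB described by the yaml
+  repo.StartHttpServer();                 // WebView: browse DB info in a browser
+  Status s = db->Put(WriteOptions(), "key", "value");
+  if (!s.ok()) fprintf(stderr, "Put failed: %s\n", s.ToString().c_str());
+  repo.CloseAllDB();                      // close the DB and release plugin objects
+  return 0;
+}
+```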
+
+## ToplingDB cloud native DB services
+1. [MyTopling](https://github.com/topling/mytopling)(MySQL on ToplingDB), [Managed MyTopling on aliyun](https://topling.cn/products/mytopling/)
+1. [Todis](https://github.com/topling/todis)(Redis on ToplingDB), [Managed Todis on aliyun](https://topling.cn/products/todis-enterprise/)
+
+## ToplingDB Components
+With the SidePlugin mechanism, plugins/components can be physically separated from the ToplingDB core:
+1. They can be compiled into a separate dynamic lib and loaded at runtime
+2. User code needs no changes; just change the json/yaml config files
+3. Topling's non-open-source enterprise plugins/components are delivered in this way
+
+### Repository dir structure
+```bash
+toplingdb
+ \__ sideplugin
+ \__ rockside (submodule , sideplugin core and framework)
+ \__ topling-zip (auto clone, zip and core lib)
+ \__ cspp-memtab (auto clone, sideplugin component)
+ \__ cspp-wbwi (auto clone, sideplugin component)
+ \__ topling-sst (auto clone, sideplugin component)
+ \__ topling-rocks (auto clone, sideplugin component)
+ \__ topling-zip_table_reader (auto clone, sideplugin component)
+ \__ topling-dcompact (auto clone, sideplugin component)
+ \_ tools/dcompact (dcompact-worker binary app)
+```
+
+ Repository | Permission | Description (and components)
+-------------- | ---------- | -----------
+[ToplingDB](https://github.com/topling/toplingdb) | public | Top repository, forked from [RocksDB](https://github.com/facebook/rocksdb) with our fixes, refactorings and enhancements
+[rockside](https://github.com/topling/rockside) | public | This is a submodule that contains:<br/>- SidePlugin framework and Builtin SidePlugin**s**<br/>- Embedded Http Server and Prometheus metrics
+[cspp-wbwi (**W**rite**B**atch**W**ith**I**ndex)](https://github.com/topling/cspp-wbwi) | public | With CSPP and careful coding, **CSPP_WBWI** is 20x faster than RocksDB's SkipList-based WBWI
+[cspp-memtable](https://github.com/topling/cspp-memtable) | public | (**CSPP** is **C**rash **S**afe **P**arallel **P**atricia trie) MemTab, which outperforms SkipList in all aspects: 3x lower memory usage, 7x single-thread performance, and perfect multi-thread scaling
+[topling-sst](https://github.com/topling/topling-sst) | public | 1. [SingleFastTable](https://github.com/topling/rockside/wiki/SingleFastTable) (designed for L0 and L1)<br/>2. VecAutoSortTable (designed for MyTopling bulk_load)<br/>3. Deprecated [ToplingFastTable](https://github.com/topling/rockside/wiki/ToplingFastTable), CSPPAutoSortTable
+[topling-dcompact](https://github.com/topling/topling-dcompact) | public | Distributed Compaction with a general-purpose dcompact_worker application; offloads compactions to elastic computing clusters and is much more powerful than RocksDB's Remote Compaction
+[topling-rocks](https://github.com/topling/topling-rocks) | **private** | [Topling**Zip**Table](https://github.com/topling/rockside/wiki/ToplingZipTable), an SST implementation optimized for RAM and SSD space, aimed at L2+ level compaction; it uses Topling's dedicated searchable in-memory data compression algorithms
+[topling-zip_table_reader](https://github.com/topling/topling-zip_table_reader) | public | Lets community users read Topling**Zip**Table; the builder of Topling**Zip**Table is in the private [topling-rocks](https://github.com/topling/topling-rocks) repo
+
+To simplify compilation, these repo**s** are auto-cloned by ToplingDB's Makefile. Community users can auto-clone the public repos successfully but will fail to clone the **private** repo, so their ToplingDB is built without the **private** components; this is the so-called **community** version.
+
+## Run db_bench
+ToplingDB requires C++17; gcc 8.3 or newer is recommended, and clang also works.
+
+Even without ToplingZipTable, ToplingDB is much faster than upstream RocksDB:
+```bash
+sudo yum -y install git libaio-devel gcc-c++ gflags-devel zlib-devel bzip2-devel libcurl-devel liburing-devel
+#sudo apt-get update -y && sudo apt-get install -y libjemalloc-dev libaio-dev libgflags-dev zlib1g-dev libbz2-dev libcurl4-gnutls-dev liburing-dev libsnappy-dev libbz2-dev liblz4-dev libzstd-dev
+git clone https://github.com/topling/toplingdb
+cd toplingdb
+make -j`nproc` db_bench DEBUG_LEVEL=0
+cp sideplugin/rockside/src/topling/web/{style.css,index.html} ${/path/to/dbdir}
+cp sideplugin/rockside/sample-conf/db_bench_*.yaml .
+export LD_LIBRARY_PATH=`find sideplugin -name lib_shared`
+# change db_bench_community.yaml to your needs
+# 1. use the default path (/dev/shm) if you have no fast disk (such as on a cloud server)
+# 2. set max_background_compactions to your cpu core count
+# 3. if you have permissions for the private github repo topling-rocks, you can use db_bench_enterprise.yaml
+# 4. with db_bench_community.yaml, db_bench is faster than upstream RocksDB
+# 5. with db_bench_enterprise.yaml, it is much faster than with db_bench_community.yaml
+# the -json command option accepts both json and yaml files; a yaml file is used here because it is more human readable
+./db_bench -json=db_bench_community.yaml -num=10000000 -disable_wal=true -value_size=20 -benchmarks=fillrandom,readrandom -batch_size=10
+# you can access http://127.0.0.1:2011 to see webview
+# you can see this db_bench is much faster than RocksDB
+```
+## Configurable features
+For performance and simplicity, ToplingDB disables some RocksDB features by default:
+
+Feature|Control MACRO
+-------|-------------
+Dynamic creation of ColumnFamily | ROCKSDB_DYNAMIC_CREATE_CF
+User level timestamp on key | TOPLINGDB_WITH_TIMESTAMP
+Wide Columns | TOPLINGDB_WITH_WIDE_COLUMNS
+
+**Note**: Dynamic creation of ColumnFamily is not supported by SidePlugin
+
+To enable these features, add `-D${MACRO_NAME}` to the make variable `EXTRA_CXXFLAGS`, e.g. to build rocksdbjava with dynamic ColumnFamily creation:
+```
+make -j`nproc` EXTRA_CXXFLAGS='-DROCKSDB_DYNAMIC_CREATE_CF' rocksdbjava
+```
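+
+As an illustration, wide columns are exercised through RocksDB's regular `PutEntity`/`GetEntity` API; the sketch below presumes a ToplingDB build with `EXTRA_CXXFLAGS='-DTOPLINGDB_WITH_WIDE_COLUMNS'` (an assumption based on the feature table above; the calls themselves are standard RocksDB API, not taken from this patch):
+```cpp
+#include <rocksdb/db.h>
+#include <rocksdb/wide_columns.h>
+
+// Sketch only: requires a ToplingDB build with -DTOPLINGDB_WITH_WIDE_COLUMNS
+// (per the feature table above).
+void WideColumnExample(rocksdb::DB* db) {
+  using namespace rocksdb;
+  WideColumns columns{{"attr1", "value1"}, {"attr2", "value2"}};
+  Status s = db->PutEntity(WriteOptions(), db->DefaultColumnFamily(),
+                           "user:1", columns);
+  if (!s.ok()) return;
+  PinnableWideColumns result;
+  s = db->GetEntity(ReadOptions(), db->DefaultColumnFamily(), "user:1", &result);
+  if (!s.ok()) return;
+  for (const WideColumn& col : result.columns()) {
+    (void)col; // col.name() and col.value() are Slices
+  }
+}
+```
+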
+## License
+To conform to the open source license, the following term disallowing ByteDance was deleted on 2023-04-24;
+that is to say, ByteDance using ToplingDB is no longer illegal and is not a shame.
+
+~~We disallow ByteDance using this software, other terms are identical with the
+upstream RocksDB license,~~ see [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING) and
+[LICENSE.leveldb](LICENSE.leveldb).
+
+The terms disallowing ByteDance have also been deleted from [LICENSE.Apache](LICENSE.Apache), [COPYING](COPYING) and
+[LICENSE.leveldb](LICENSE.leveldb).
+
+
+
+
+
## RocksDB: A Persistent Key-Value Store for Flash and RAM Storage
[![CircleCI Status](https://circleci.com/gh/facebook/rocksdb.svg?style=svg)](https://circleci.com/gh/facebook/rocksdb)
diff --git a/TARGETS b/TARGETS
index e8aaf325d4..8a851d7502 100644
--- a/TARGETS
+++ b/TARGETS
@@ -39,6 +39,7 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"db/c.cc",
"db/column_family.cc",
"db/compaction/compaction.cc",
+ "db/compaction/compaction_executor.cc",
"db/compaction/compaction_iterator.cc",
"db/compaction/compaction_job.cc",
"db/compaction/compaction_outputs.cc",
@@ -176,6 +177,15 @@ cpp_library_wrapper(name="rocksdb_lib", srcs=[
"port/win/port_win.cc",
"port/win/win_logger.cc",
"port/win/win_thread.cc",
+ "sideplugin/rockside/src/topling/block_based_table_side_plugin.cc",
+ "sideplugin/rockside/src/topling/builtin_db_open.cc",
+ "sideplugin/rockside/src/topling/builtin_plugin_basic.cc",
+ "sideplugin/rockside/src/topling/builtin_plugin_misc.cc",
+ "sideplugin/rockside/src/topling/builtin_table_factory.cc",
+ "sideplugin/rockside/src/topling/side_plugin_repo.cc",
+ "sideplugin/rockside/src/topling/side_plugin_tpl_inst.cc",
+ "sideplugin/rockside/src/topling/web/CivetServer.cc",
+ "sideplugin/rockside/src/topling/web/json_civetweb.cc",
"table/adaptive/adaptive_table_factory.cc",
"table/block_based/binary_search_index_reader.cc",
"table/block_based/block.cc",
diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform
index a5e2b5aa2f..2a46b0209b 100755
--- a/build_tools/build_detect_platform
+++ b/build_tools/build_detect_platform
@@ -49,7 +49,7 @@ fi
if [ "$ROCKSDB_CXX_STANDARD" ]; then
PLATFORM_CXXFLAGS="-std=$ROCKSDB_CXX_STANDARD"
else
- PLATFORM_CXXFLAGS="-std=c++17"
+ PLATFORM_CXXFLAGS="-std=gnu++17"
fi
# we currently depend on POSIX platform
@@ -238,7 +238,7 @@ EOF
Cygwin)
PLATFORM=CYGWIN
PLATFORM_SHARED_CFLAGS=""
- PLATFORM_CXXFLAGS="-std=gnu++11"
+ PLATFORM_CXXFLAGS="-std=gnu++17"
COMMON_FLAGS="$COMMON_FLAGS -DCYGWIN"
if [ -z "$USE_CLANG" ]; then
COMMON_FLAGS="$COMMON_FLAGS -fno-builtin-memcmp"
@@ -334,6 +334,9 @@ EOF
then
COMMON_FLAGS="$COMMON_FLAGS -DGFLAGS=1 -DGFLAGS_NAMESPACE=google"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lgflags"
+ else
+ echo Not found: GFLAGS 1>&2
+ exit 1
fi
fi
@@ -347,6 +350,9 @@ EOF
COMMON_FLAGS="$COMMON_FLAGS -DZLIB"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS -lz"
JAVA_LDFLAGS="$JAVA_LDFLAGS -lz"
+ else
+ echo Not found: zlib "(for gzip)" 1>&2
+ exit 1
fi
fi
diff --git a/cache/lru_cache.cc b/cache/lru_cache.cc
index 9d16952243..3080acfd29 100644
--- a/cache/lru_cache.cc
+++ b/cache/lru_cache.cc
@@ -528,14 +528,14 @@ LRUHandle* LRUCacheShard::CreateHandle(const Slice& key, uint32_t hash,
// Allocate the memory here outside of the mutex.
// If the cache is full, we'll have to release it.
// It shouldn't happen very often though.
- LRUHandle* e =
-      static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) - 1 + key.size()));
-
+ static_assert(sizeof(LRUHandle) == 64);
+  auto e = static_cast<LRUHandle*>(malloc(sizeof(LRUHandle) + key.size()));
+ e->padding = 0; // padding makes key_data aligned better
e->value = value;
e->m_flags = 0;
e->im_flags = 0;
e->helper = helper;
- e->key_length = key.size();
+ e->key_length = (uint32_t)key.size();
e->hash = hash;
e->refs = 0;
e->next = e->prev = nullptr;
diff --git a/cache/lru_cache.h b/cache/lru_cache.h
index 1a9ba04425..33fdc79a73 100644
--- a/cache/lru_cache.h
+++ b/cache/lru_cache.h
@@ -54,7 +54,7 @@ struct LRUHandle {
LRUHandle* next;
LRUHandle* prev;
size_t total_charge; // TODO(opt): Only allow uint32_t?
- size_t key_length;
+ uint32_t key_length;
// The hash of key(). Used for fast sharding and comparisons.
uint32_t hash;
// The number of external refs to this entry. The cache itself is not counted.
@@ -87,8 +87,10 @@ struct LRUHandle {
IM_IS_STANDALONE = (1 << 2),
};
+ uint16_t padding;
+
// Beginning of the key (MUST BE THE LAST FIELD IN THIS STRUCT!)
- char key_data[1];
+ char key_data[0];
Slice key() const { return Slice(key_data, key_length); }
diff --git a/db/arena_wrapped_db_iter.cc b/db/arena_wrapped_db_iter.cc
index e6dcb66962..3a8b37d466 100644
--- a/db/arena_wrapped_db_iter.cc
+++ b/db/arena_wrapped_db_iter.cc
@@ -19,14 +19,28 @@
namespace ROCKSDB_NAMESPACE {
-inline static SequenceNumber GetSeqNum(const DBImpl* db, const Snapshot* s) {
- if (s) {
+static constexpr size_t KEEP_SNAPSHOT = 16;
+
+inline static
+SequenceNumber GetSeqNum(const DBImpl* db, const Snapshot* s, const DBIter* i) {
+ if (size_t(s) == KEEP_SNAPSHOT)
+ return i->get_sequence();
+ else if (s)
+    //return static_cast_with_check<const SnapshotImpl>(s)->number_;
return s->GetSequenceNumber();
- } else {
+ else
return db->GetLatestSequenceNumber();
- }
}
+Status Iterator::RefreshKeepSnapshot(bool keep_iter_pos) {
+  return Refresh(reinterpret_cast<const Snapshot*>(KEEP_SNAPSHOT), keep_iter_pos);
+}
+
+ArenaWrappedDBIter::ArenaWrappedDBIter() {
+ // do nothing
+}
+#define db_iter_ (&db_iter_obj_)
+
Status ArenaWrappedDBIter::GetProperty(std::string prop_name,
std::string* prop) {
if (prop_name == "rocksdb.iterator.super-version-number") {
@@ -45,14 +59,15 @@ void ArenaWrappedDBIter::Init(
const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iteration,
uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
- auto mem = arena_.AllocateAligned(sizeof(DBIter));
- db_iter_ =
+ auto mem = db_iter_;
new (mem) DBIter(env, read_options, ioptions, mutable_cf_options,
ioptions.user_comparator, /* iter */ nullptr, version,
sequence, true, max_sequential_skip_in_iteration,
read_callback, db_impl, cfd, expose_blob_index);
+ db_iter_inited_ = true;
sv_number_ = version_number;
read_options_ = read_options;
+ read_options_.pinning_tls = nullptr; // must set null
allow_refresh_ = allow_refresh;
memtable_range_tombstone_iter_ = nullptr;
@@ -62,12 +77,17 @@ void ArenaWrappedDBIter::Init(
}
}
-Status ArenaWrappedDBIter::Refresh() { return Refresh(nullptr); }
+Status ArenaWrappedDBIter::Refresh() {
+ return Refresh(nullptr, false); // do not keep iter pos
+}
-Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
+// when keep_iter_pos is true, user code should ensure ReadOptions's
+// lower_bound and upper_bound are not changed
+Status ArenaWrappedDBIter::Refresh(const Snapshot* snap, bool keep_iter_pos) {
if (cfd_ == nullptr || db_impl_ == nullptr || !allow_refresh_) {
return Status::NotSupported("Creating renew iterator is not allowed.");
}
+ assert(db_iter_inited_);
assert(db_iter_ != nullptr);
// TODO(yiwu): For last_seq_same_as_publish_seq_==false, this is not the
// correct behavior. Will be corrected automatically when we take a snapshot
@@ -80,7 +100,25 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:1");
TEST_SYNC_POINT("ArenaWrappedDBIter::Refresh:2");
auto reinit_internal_iter = [&]() {
+ std::string curr_key, curr_val;
+ bool is_valid = this->Valid();
+ SequenceNumber old_iter_seq = db_iter_->get_sequence();
+ SequenceNumber latest_seq = GetSeqNum(db_impl_, snap, db_iter_);
+ if (is_valid && keep_iter_pos) {
+ curr_key = this->key().ToString();
+ curr_val = this->value().ToString();
+ }
+ Snapshot* pin_snap = nullptr;
+ if (size_t(snap) == KEEP_SNAPSHOT) {
+      // pin the snapshot latest_seq to avoid a race condition where the
+      // snapshot latest_seq is garbage collected by a compaction, which
+      // may cause many errors; for example, an externally visible symptom
+      // is that Seek on the new iterator below fails (with the same
+      // read_opt.lower_bound/upper_bound...)
+ pin_snap = db_impl_->GetSnapshotImpl(latest_seq, false);
+ }
Env* env = db_iter_->env();
+ db_iter_inited_ = false;
db_iter_->~DBIter();
arena_.~Arena();
new (&arena_) Arena();
@@ -101,13 +139,36 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
read_options_, cfd_, sv, &arena_, read_seq,
/* allow_unprepared_value */ true, /* db_iter */ this);
SetIterUnderDBIter(internal_iter);
+ if (is_valid && keep_iter_pos) {
+ this->Seek(curr_key);
+ if (old_iter_seq == latest_seq) {
+ ROCKSDB_VERIFY_F(this->Valid(),
+ "curr_key = %s, seq = %lld, snap = %p, pin_snap = %p",
+ Slice(curr_key).hex().c_str(),
+ (long long)latest_seq, snap, pin_snap);
+ ROCKSDB_VERIFY_F(key() == curr_key, "%s %s",
+ key().hex().c_str(), Slice(curr_key).hex().c_str());
+ ROCKSDB_VERIFY_F(value() == curr_val, "%s %s",
+ value().hex().c_str(), Slice(curr_val).hex().c_str());
+ }
+ }
+ if (pin_snap) {
+ db_impl_->ReleaseSnapshot(pin_snap);
+ }
};
while (true) {
if (sv_number_ != cur_sv_number) {
reinit_internal_iter();
break;
+ } else if (size_t(snap) == KEEP_SNAPSHOT) {
+ break;
} else {
SequenceNumber read_seq = GetSeqNum(db_impl_, snapshot);
+ SequenceNumber latest_seq = snap ? snap->GetSequenceNumber()
+ : db_impl_->GetLatestSequenceNumber();
+ if (latest_seq == db_iter_->get_sequence()) {
+ break;
+ }
// Refresh range-tombstones in MemTable
if (!read_options_.ignore_range_deletions) {
SuperVersion* sv = cfd_->GetThreadLocalSuperVersion(db_impl_);
@@ -143,6 +204,9 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
}
db_impl_->ReturnAndCleanupSuperVersion(cfd_, sv);
}
+ // Refresh latest sequence number
+ db_iter_->set_sequence(latest_seq);
+ // db_iter_->set_valid(false); // comment out for ToplingDB
// Check again if the latest super version number is changed
uint64_t latest_sv_number = cfd_->GetSuperVersionNumber();
if (latest_sv_number != cur_sv_number) {
@@ -156,15 +220,24 @@ Status ArenaWrappedDBIter::Refresh(const Snapshot* snapshot) {
break;
}
}
+ if (size_t(snap) > KEEP_SNAPSHOT) {
+ this->read_options_.snapshot = snap;
+ }
return Status::OK();
}
ArenaWrappedDBIter* NewArenaWrappedDbIterator(
- Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
- const MutableCFOptions& mutable_cf_options, const Version* version,
- const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
- uint64_t version_number, ReadCallback* read_callback, DBImpl* db_impl,
- ColumnFamilyData* cfd, bool expose_blob_index, bool allow_refresh) {
+ const ReadOptions& read_options, const SuperVersion* sv,
+ SequenceNumber sequence, ReadCallback* read_callback, DBImpl* db_impl,
+ bool expose_blob_index, bool allow_refresh) {
+ auto version = sv->current;
+ auto version_number = sv->version_number;
+ auto env = version->env();
+ auto cfd = sv->cfd;
+ const auto& ioptions = *cfd->ioptions();
+ const auto& mutable_cf_options = sv->mutable_cf_options;
+ auto max_sequential_skip_in_iterations =
+ mutable_cf_options.max_sequential_skip_in_iterations;
ArenaWrappedDBIter* iter = new ArenaWrappedDBIter();
iter->Init(env, read_options, ioptions, mutable_cf_options, version, sequence,
max_sequential_skip_in_iterations, version_number, read_callback,
diff --git a/db/arena_wrapped_db_iter.h b/db/arena_wrapped_db_iter.h
index 678ea3e78d..2088ed4390 100644
--- a/db/arena_wrapped_db_iter.h
+++ b/db/arena_wrapped_db_iter.h
@@ -33,10 +33,12 @@ class Version;
// to allocate.
// When using the class's Iterator interface, the behavior is exactly
// the same as the inner DBIter.
+#define db_iter_ (&db_iter_obj_)
class ArenaWrappedDBIter : public Iterator {
public:
+ ArenaWrappedDBIter();
~ArenaWrappedDBIter() override {
- if (db_iter_ != nullptr) {
+ if (db_iter_inited_) {
db_iter_->~DBIter();
} else {
assert(false);
@@ -70,17 +72,21 @@ class ArenaWrappedDBIter : public Iterator {
}
void Next() override { db_iter_->Next(); }
void Prev() override { db_iter_->Prev(); }
+ ROCKSDB_FLATTEN
Slice key() const override { return db_iter_->key(); }
+ ROCKSDB_FLATTEN
Slice value() const override { return db_iter_->value(); }
const WideColumns& columns() const override { return db_iter_->columns(); }
Status status() const override { return db_iter_->status(); }
Slice timestamp() const override { return db_iter_->timestamp(); }
+ ROCKSDB_FLATTEN
+ bool PrepareValue() override { return db_iter_->PrepareValue(); }
bool IsBlob() const { return db_iter_->IsBlob(); }
Status GetProperty(std::string prop_name, std::string* prop) override;
Status Refresh() override;
- Status Refresh(const Snapshot*) override;
+ Status Refresh(const Snapshot*, bool keep_iter_pos) override;
void Init(Env* env, const ReadOptions& read_options,
const ImmutableOptions& ioptions,
@@ -101,7 +107,7 @@ class ArenaWrappedDBIter : public Iterator {
}
private:
- DBIter* db_iter_ = nullptr;
+ union { DBIter db_iter_obj_; };
Arena arena_;
uint64_t sv_number_;
ColumnFamilyData* cfd_ = nullptr;
@@ -110,19 +116,18 @@ class ArenaWrappedDBIter : public Iterator {
ReadCallback* read_callback_;
bool expose_blob_index_ = false;
bool allow_refresh_ = true;
+ bool db_iter_inited_ = false;
// If this is nullptr, it means the mutable memtable does not contain range
// tombstone when added under this DBIter.
TruncatedRangeDelIterator** memtable_range_tombstone_iter_ = nullptr;
};
+#undef db_iter_
// Generate the arena wrapped iterator class.
// `db_impl` and `cfd` are used for reneweal. If left null, renewal will not
// be supported.
extern ArenaWrappedDBIter* NewArenaWrappedDbIterator(
- Env* env, const ReadOptions& read_options, const ImmutableOptions& ioptions,
- const MutableCFOptions& mutable_cf_options, const Version* version,
- const SequenceNumber& sequence, uint64_t max_sequential_skip_in_iterations,
- uint64_t version_number, ReadCallback* read_callback,
- DBImpl* db_impl = nullptr, ColumnFamilyData* cfd = nullptr,
+ const ReadOptions&, const SuperVersion*, SequenceNumber sequence,
+ ReadCallback*, DBImpl* db_impl = nullptr,
bool expose_blob_index = false, bool allow_refresh = true);
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/blob/blob_counting_iterator_test.cc b/db/blob/blob_counting_iterator_test.cc
index c7bbc8f587..eced3f2167 100644
--- a/db/blob/blob_counting_iterator_test.cc
+++ b/db/blob/blob_counting_iterator_test.cc
@@ -136,7 +136,7 @@ TEST(BlobCountingIteratorTest, CountBlobs) {
{
IterateResult result;
ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
- ASSERT_EQ(result.key, keys[1]);
+ ASSERT_EQ(result.key(), keys[1]);
ASSERT_EQ(blob_counter.user_key(), user_key1);
ASSERT_TRUE(blob_counter.Valid());
ASSERT_OK(blob_counter.status());
@@ -151,7 +151,7 @@ TEST(BlobCountingIteratorTest, CountBlobs) {
{
IterateResult result;
ASSERT_TRUE(blob_counter.NextAndGetResult(&result));
- ASSERT_EQ(result.key, keys[2]);
+ ASSERT_EQ(result.key(), keys[2]);
ASSERT_EQ(blob_counter.user_key(), user_key2);
ASSERT_TRUE(blob_counter.Valid());
ASSERT_OK(blob_counter.status());
diff --git a/db/blob/blob_fetcher.h b/db/blob/blob_fetcher.h
index 8aeaf965d2..ad6dda64b3 100644
--- a/db/blob/blob_fetcher.h
+++ b/db/blob/blob_fetcher.h
@@ -19,6 +19,7 @@ class BlobIndex;
// A thin wrapper around the blob retrieval functionality of Version.
class BlobFetcher {
public:
+ virtual ~BlobFetcher() = default;
BlobFetcher(const Version* version, const ReadOptions& read_options)
: version_(version), read_options_(read_options) {}
@@ -32,6 +33,14 @@ class BlobFetcher {
private:
const Version* version_;
- ReadOptions read_options_;
+ const ReadOptions& read_options_;
};
+
+class BlobFetcherCopyReadOptions : public BlobFetcher {
+ const ReadOptions read_options_copy_;
+public:
+ BlobFetcherCopyReadOptions(const Version* v, const ReadOptions& ro)
+ : BlobFetcher(v, read_options_copy_), read_options_copy_(ro) {}
+};
+
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/builder.cc b/db/builder.cc
index d3040ee9e2..a5070297ee 100644
--- a/db/builder.cc
+++ b/db/builder.cc
@@ -413,13 +413,20 @@ Status BuildTable(
OutputValidator file_validator(tboptions.internal_comparator,
/*enable_order_check=*/true,
/*enable_hash=*/true);
+ file_validator.m_file_number = meta->fd.GetNumber();
for (it->SeekToFirst(); it->Valid(); it->Next()) {
// Generate a rolling 64-bit hash of the key and values
file_validator.Add(it->key(), it->value()).PermitUncheckedError();
}
s = it->status();
if (s.ok() && !output_validator.CompareValidator(file_validator)) {
- s = Status::Corruption("Paranoid checksums do not match");
+ #if !defined(ROCKSDB_UNIT_TEST)
+ auto& fd = meta->fd;
+ ROCKSDB_DIE("BuildTable: Paranoid checksums do not match(%d:%lld.sst)",
+ fd.GetPathId(), (long long)fd.GetNumber());
+ #else
+ s = Status::Corruption("BuildTable: Paranoid checksums do not match");
+ #endif
}
}
}
diff --git a/db/column_family.cc b/db/column_family.cc
index 9782cd31a7..964b5861a1 100644
--- a/db/column_family.cc
+++ b/db/column_family.cc
@@ -79,10 +79,10 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
}
}
-uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
+uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd_->GetID(); }
const std::string& ColumnFamilyHandleImpl::GetName() const {
- return cfd()->GetName();
+ return cfd_->GetName();
}
Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
@@ -93,7 +93,25 @@ Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
}
const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
- return cfd()->user_comparator();
+ return cfd_->user_comparator();
+}
+ColumnFamilyHandle* ColumnFamilyHandleImpl::CloneHandle() const {
+ return new ColumnFamilyHandleImpl(cfd_, db_, mutex_);
+}
+
+uint32_t ColumnFamilyHandleInternal::GetID() const {
+ return internal_cfd_->GetID();
+}
+const std::string& ColumnFamilyHandleInternal::GetName() const {
+ return internal_cfd_->GetName();
+}
+const Comparator* ColumnFamilyHandleInternal::GetComparator() const {
+ return internal_cfd_->user_comparator();
+}
+ColumnFamilyHandle* ColumnFamilyHandleInternal::CloneHandle() const {
+ auto p = new ColumnFamilyHandleInternal();
+ p->SetCFD(internal_cfd_);
+ return p;
}
void GetIntTblPropCollectorFactory(
@@ -541,7 +559,7 @@ ColumnFamilyData::ColumnFamilyData(
ioptions_.max_write_buffer_size_to_maintain),
super_version_(nullptr),
super_version_number_(0),
- local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
+ local_sv_(&SuperVersionUnrefHandle),
next_(nullptr),
prev_(nullptr),
log_number_(0),
@@ -717,7 +735,12 @@ bool ColumnFamilyData::UnrefAndTryDelete() {
super_version_ = nullptr;
// Release SuperVersion references kept in ThreadLocalPtr.
- local_sv_.reset();
+ #if 0
+ local_sv_.~ThreadLocalPtr();
+ new(&local_sv_)ThreadLocalPtr(&SuperVersionUnrefHandle);
+ #else
+ local_sv_.Destroy();
+ #endif
if (sv->Unref()) {
// Note: sv will delete this ColumnFamilyData during Cleanup()
@@ -763,7 +786,11 @@ uint64_t ColumnFamilyData::OldestLogToKeep() {
return current_log;
}
+#if defined(ROCKSDB_UNIT_TEST)
const double kIncSlowdownRatio = 0.8;
+#else
+const double kIncSlowdownRatio = 0.97; // topling specific
+#endif
const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
const double kNearStopSlowdownRatio = 0.6;
const double kDelayRecoverSlowdownRatio = 1.4;
@@ -1122,10 +1149,65 @@ uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
return current_->GetSstFilesSize();
}
+void ColumnFamilyData::PrepareNewMemtableInBackground(
+ const MutableCFOptions& mutable_cf_options) {
+ #if !defined(ROCKSDB_UNIT_TEST)
+ {
+ std::lock_guard lk(precreated_memtable_mutex_);
+ if (precreated_memtable_list_.full()) {
+ // do nothing
+ return;
+ }
+ }
+ auto beg = ioptions_.clock->NowNanos();
+ auto tab = new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ write_buffer_manager_, 0/*earliest_seq*/, id_);
+ auto end = ioptions_.clock->NowNanos();
+ RecordInHistogram(ioptions_.stats, MEMTAB_CONSTRUCT_NANOS, end - beg);
+ {
+ std::lock_guard lk(precreated_memtable_mutex_);
+ if (LIKELY(!precreated_memtable_list_.full())) {
+ precreated_memtable_list_.emplace_back(tab);
+ tab = nullptr;
+ }
+ }
+ if (UNLIKELY(nullptr != tab)) { // precreated_memtable_list_ is full
+ // this is very rare, we have not put `tab` to precreated_memtable_list_,
+ // but this thread must keep going on, just delete `tab`
+ ROCKS_LOG_WARN(ioptions_.info_log,
+ "precreated_memtable_list_ is full, discard the newly created memtab");
+ delete tab;
+ }
+ #endif
+}
+
MemTable* ColumnFamilyData::ConstructNewMemtable(
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
- return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
+ MemTable* tab = nullptr;
+ #if !defined(ROCKSDB_UNIT_TEST)
+ {
+ std::lock_guard lk(precreated_memtable_mutex_);
+ if (!precreated_memtable_list_.empty()) {
+ tab = precreated_memtable_list_.front().release();
+ precreated_memtable_list_.pop_front();
+ }
+ }
+ #endif
+ if (tab) {
+ tab->SetCreationSeq(earliest_seq);
+ tab->SetEarliestSequenceNumber(earliest_seq);
+ } else {
+ #if !defined(ROCKSDB_UNIT_TEST)
+ auto beg = ioptions_.clock->NowNanos();
+ #endif
+ tab = new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
write_buffer_manager_, earliest_seq, id_);
+ #if !defined(ROCKSDB_UNIT_TEST)
+ auto end = ioptions_.clock->NowNanos();
+ RecordInHistogram(ioptions_.stats, MEMTAB_CONSTRUCT_NANOS, end - beg);
+ #endif
+ }
+ return tab;
}
void ColumnFamilyData::CreateNewMemtable(
@@ -1252,6 +1334,12 @@ SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
return sv;
}
+template<class T>
+inline T NoAtomicLoad(const std::atomic<T>& x) {
+  static_assert(sizeof(x) == sizeof(T));
+  return reinterpret_cast<const T&>(x);
+}
+
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
// The SuperVersion is cached in thread local storage to avoid acquiring
// mutex when SuperVersion does not change since the last use. When a new
@@ -1264,7 +1352,7 @@ SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
// have swapped in kSVObsolete. We re-check the value at when returning
// SuperVersion back to thread local, with an atomic compare and swap.
// The superversion will need to be released if detected to be stale.
- void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
+ void* ptr = local_sv_.Swap(SuperVersion::kSVInUse);
// Invariant:
// (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
// (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
@@ -1286,7 +1374,7 @@ bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
assert(sv != nullptr);
// Put the SuperVersion back
void* expected = SuperVersion::kSVInUse;
-  if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
+  if (local_sv_.CompareAndSwap(static_cast<void*>(sv), expected)) {
// When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
// storage has not been altered and no Scrape has happened. The
// SuperVersion is still current.
@@ -1354,7 +1442,7 @@ void ColumnFamilyData::InstallSuperVersion(
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
autovector<void*> sv_ptrs;
- local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
+ local_sv_.Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
for (auto ptr : sv_ptrs) {
assert(ptr);
if (ptr == SuperVersion::kSVInUse) {
@@ -1608,6 +1696,10 @@ void ColumnFamilyData::RecoverEpochNumbers() {
vstorage->RecoverEpochNumbers(this);
}
+const std::string& ColumnFamilyData::GetDBName() const {
+ return column_family_set_->db_name_;
+}
+
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
const ImmutableDBOptions* db_options,
const FileOptions& file_options,
diff --git a/db/column_family.h b/db/column_family.h
index 2a38feb731..696e2705be 100644
--- a/db/column_family.h
+++ b/db/column_family.h
@@ -29,6 +29,8 @@
#include "util/hash_containers.h"
#include "util/thread_local.h"
+#include
+
namespace ROCKSDB_NAMESPACE {
class Version;
@@ -167,12 +169,13 @@ class ColumnFamilyHandleImpl : public ColumnFamilyHandle {
InstrumentedMutex* mutex);
// destroy without mutex
virtual ~ColumnFamilyHandleImpl();
- virtual ColumnFamilyData* cfd() const { return cfd_; }
+ virtual ColumnFamilyData* cfd() const override { return cfd_; }
virtual uint32_t GetID() const override;
virtual const std::string& GetName() const override;
virtual Status GetDescriptor(ColumnFamilyDescriptor* desc) override;
virtual const Comparator* GetComparator() const override;
+ virtual ColumnFamilyHandle* CloneHandle() const override;
private:
ColumnFamilyData* cfd_;
@@ -194,6 +197,10 @@ class ColumnFamilyHandleInternal : public ColumnFamilyHandleImpl {
void SetCFD(ColumnFamilyData* _cfd) { internal_cfd_ = _cfd; }
virtual ColumnFamilyData* cfd() const override { return internal_cfd_; }
+ uint32_t GetID() const final;
+ const std::string& GetName() const final;
+ const Comparator* GetComparator() const override;
+ ColumnFamilyHandle* CloneHandle() const override;
private:
ColumnFamilyData* internal_cfd_;
@@ -371,6 +378,8 @@ class ColumnFamilyData {
// calculate the oldest log needed for the durability of this column family
uint64_t OldestLogToKeep();
+ void PrepareNewMemtableInBackground(const MutableCFOptions&);
+
// See Memtable constructor for explanation of earliest_seq param.
MemTable* ConstructNewMemtable(const MutableCFOptions& mutable_cf_options,
SequenceNumber earliest_seq);
@@ -450,6 +459,9 @@ class ColumnFamilyData {
uint64_t GetSuperVersionNumber() const {
return super_version_number_.load();
}
+ uint64_t GetSuperVersionNumberNoAtomic() const {
+    return reinterpret_cast<const uint64_t&>(super_version_number_);
+ }
// will return a pointer to SuperVersion* if previous SuperVersion
// if its reference count is zero and needs deletion or nullptr if not
// As argument takes a pointer to allocated SuperVersion to enable
@@ -518,7 +530,7 @@ class ColumnFamilyData {
// user's setting. Called by background flush job.
bool ShouldPostponeFlushToRetainUDT(uint64_t max_memtable_id);
- ThreadLocalPtr* TEST_GetLocalSV() { return local_sv_.get(); }
+ ThreadLocalPtr* TEST_GetLocalSV() { return &local_sv_; }
WriteBufferManager* write_buffer_mgr() { return write_buffer_manager_; }
std::shared_ptr<CacheReservationManager>
GetFileMetadataCacheReservationManager() {
@@ -549,6 +561,8 @@ class ColumnFamilyData {
// of its files (if missing)
void RecoverEpochNumbers();
+ const std::string& GetDBName() const;
+
private:
friend class ColumnFamilySet;
ColumnFamilyData(uint32_t id, const std::string& name,
@@ -590,6 +604,12 @@ class ColumnFamilyData {
WriteBufferManager* write_buffer_manager_;
+ #if !defined(ROCKSDB_UNIT_TEST)
+ // precreated_memtable_list_.size() is normally 1
+  terark::fixed_circular_queue<std::unique_ptr<MemTable>, 4> precreated_memtable_list_;
+ std::mutex precreated_memtable_mutex_;
+ #endif
+
MemTable* mem_;
MemTableList imm_;
SuperVersion* super_version_;
@@ -601,7 +621,7 @@ class ColumnFamilyData {
// Thread's local copy of SuperVersion pointer
// This needs to be destructed before mutex_
-  std::unique_ptr<ThreadLocalPtr> local_sv_;
+ ThreadLocalPtr local_sv_;
// pointers for a circular linked list. we use it to support iterations over
// all column families that are alive (note: dropped column families can also
diff --git a/db/compaction/clipping_iterator.h b/db/compaction/clipping_iterator.h
index 3f50cdd9dd..db7131db74 100644
--- a/db/compaction/clipping_iterator.h
+++ b/db/compaction/clipping_iterator.h
@@ -17,14 +17,54 @@ namespace ROCKSDB_NAMESPACE {
// iterator has already performed the bounds checking, it relies on that result;
// otherwise, it performs the necessary key comparisons itself. Both bounds
// are optional.
-class ClippingIterator : public InternalIterator {
+template<bool HasStart, bool HasEnd>
+struct ClippingIterBounds;
+
+template<> struct ClippingIterBounds<true, true> {
+ Slice m_start, m_end;
+ ClippingIterBounds(const Slice* start, const Slice* end)
+ : m_start(*start), m_end(*end) {
+ assert(nullptr != start);
+ assert(nullptr != end);
+ }
+ const Slice* start_() const { return &m_start; }
+ const Slice* end_() const { return &m_end; }
+};
+template<> struct ClippingIterBounds<true, false> {
+ Slice m_start;
+ ClippingIterBounds(const Slice* start, const Slice* end)
+ : m_start(*start) {
+ assert(nullptr != start);
+ assert(nullptr == end);
+ }
+ const Slice* start_() const { return &m_start; }
+ const Slice* end_() const { return nullptr; }
+};
+template<> struct ClippingIterBounds<false, true> {
+ Slice m_end;
+ ClippingIterBounds(const Slice* start, const Slice* end)
+ : m_end(*end) {
+ assert(nullptr == start);
+ assert(nullptr != end);
+ }
+ const Slice* start_() const { return nullptr; }
+ const Slice* end_() const { return &m_end; }
+};
+
+template<bool HasStart, bool HasEnd, class LessCMP>
+class ClippingIterator final : public InternalIterator, ClippingIterBounds<HasStart, HasEnd>, LessCMP {
+  using bounds = ClippingIterBounds<HasStart, HasEnd>;
+ using bounds::start_;
+ using bounds::end_;
+ bool less(const Slice& x, const Slice& y) const {
+    return static_cast<const LessCMP&>(*this)(x, y);
+ }
public:
ClippingIterator(InternalIterator* iter, const Slice* start, const Slice* end,
- const CompareInterface* cmp)
- : iter_(iter), start_(start), end_(end), cmp_(cmp), valid_(false) {
+ const LessCMP& cmp)
+ : bounds(start, end), LessCMP(cmp), iter_(iter), valid_(false) {
assert(iter_);
- assert(cmp_);
- assert(!start_ || !end_ || cmp_->Compare(*start_, *end_) <= 0);
+ assert(!start || !end || !less(*end, *start));
UpdateAndEnforceBounds();
}
@@ -32,71 +72,77 @@ class ClippingIterator : public InternalIterator {
bool Valid() const override { return valid_; }
void SeekToFirst() override {
- if (start_) {
- iter_->Seek(*start_);
+ if (start_()) {
+ iter_->Seek(*start_());
} else {
iter_->SeekToFirst();
}
+ UpdateValid();
UpdateAndEnforceUpperBound();
}
void SeekToLast() override {
- if (end_) {
- iter_->SeekForPrev(*end_);
+ if (end_()) {
+ iter_->SeekForPrev(*end_());
// Upper bound is exclusive, so we need a key which is strictly smaller
- if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ if (iter_->Valid() && !less(iter_->key(), *end_())) {
iter_->Prev();
}
} else {
iter_->SeekToLast();
}
+ UpdateValid();
UpdateAndEnforceLowerBound();
}
void Seek(const Slice& target) override {
- if (start_ && cmp_->Compare(target, *start_) < 0) {
- iter_->Seek(*start_);
+ if (start_() && less(target, *start_())) {
+ iter_->Seek(*start_());
+ UpdateValid();
UpdateAndEnforceUpperBound();
return;
}
- if (end_ && cmp_->Compare(target, *end_) >= 0) {
+ if (end_() && !less(target, *end_())) {
valid_ = false;
return;
}
iter_->Seek(target);
+ UpdateValid();
UpdateAndEnforceUpperBound();
}
void SeekForPrev(const Slice& target) override {
- if (start_ && cmp_->Compare(target, *start_) < 0) {
+ if (start_() && less(target, *start_())) {
valid_ = false;
return;
}
- if (end_ && cmp_->Compare(target, *end_) >= 0) {
- iter_->SeekForPrev(*end_);
+ if (end_() && !less(target, *end_())) {
+ iter_->SeekForPrev(*end_());
// Upper bound is exclusive, so we need a key which is strictly smaller
- if (iter_->Valid() && cmp_->Compare(iter_->key(), *end_) == 0) {
+ if (iter_->Valid() && !less(iter_->key(), *end_())) {
iter_->Prev();
}
+ UpdateValid();
UpdateAndEnforceLowerBound();
return;
}
iter_->SeekForPrev(target);
+ UpdateValid();
UpdateAndEnforceLowerBound();
}
void Next() override {
assert(valid_);
- iter_->Next();
+ valid_ = iter_->NextAndCheckValid();
UpdateAndEnforceUpperBound();
}
@@ -104,30 +150,28 @@ class ClippingIterator : public InternalIterator {
assert(valid_);
assert(result);
- IterateResult res;
- valid_ = iter_->NextAndGetResult(&res);
+ valid_ = iter_->NextAndGetResult(result);
- if (!valid_) {
+ if (UNLIKELY(!valid_)) {
return false;
}
- if (end_) {
- EnforceUpperBoundImpl(res.bound_check_result);
-
+ if (end_()) {
+ EnforceUpperBoundImpl(result->bound_check_result);
+ result->is_valid = valid_;
if (!valid_) {
return false;
}
}
- res.bound_check_result = IterBoundCheck::kInbound;
- *result = res;
+ result->bound_check_result = IterBoundCheck::kInbound;
return true;
}
void Prev() override {
assert(valid_);
- iter_->Prev();
+ valid_ = iter_->PrevAndCheckValid();
UpdateAndEnforceLowerBound();
}
@@ -201,18 +245,18 @@ class ClippingIterator : public InternalIterator {
}
void EnforceUpperBoundImpl(IterBoundCheck bound_check_result) {
- if (bound_check_result == IterBoundCheck::kInbound) {
+ if (UNLIKELY(bound_check_result == IterBoundCheck::kInbound)) {
return;
}
- if (bound_check_result == IterBoundCheck::kOutOfBound) {
+ if (UNLIKELY(bound_check_result == IterBoundCheck::kOutOfBound)) {
valid_ = false;
return;
}
assert(bound_check_result == IterBoundCheck::kUnknown);
- if (cmp_->Compare(key(), *end_) >= 0) {
+ if (!less(key(), *end_())) {
valid_ = false;
}
}
@@ -222,7 +266,7 @@ class ClippingIterator : public InternalIterator {
return;
}
- if (!end_) {
+ if (!end_()) {
return;
}
@@ -234,7 +278,7 @@ class ClippingIterator : public InternalIterator {
return;
}
- if (!start_) {
+ if (!start_()) {
return;
}
@@ -242,14 +286,14 @@ class ClippingIterator : public InternalIterator {
return;
}
- if (cmp_->Compare(key(), *start_) < 0) {
+ if (less(key(), *start_())) {
valid_ = false;
}
}
void AssertBounds() {
- assert(!valid_ || !start_ || cmp_->Compare(key(), *start_) >= 0);
- assert(!valid_ || !end_ || cmp_->Compare(key(), *end_) < 0);
+ assert(!valid_ || !start_() || !less(key(), *start_()));
+ assert(!valid_ || !end_() || less(key(), *end_()));
}
void UpdateAndEnforceBounds() {
@@ -260,22 +304,55 @@ class ClippingIterator : public InternalIterator {
}
void UpdateAndEnforceUpperBound() {
- UpdateValid();
EnforceUpperBound();
AssertBounds();
}
void UpdateAndEnforceLowerBound() {
- UpdateValid();
EnforceLowerBound();
AssertBounds();
}
InternalIterator* iter_;
- const Slice* start_;
- const Slice* end_;
- const CompareInterface* cmp_;
bool valid_;
};
+template<class LessCMP>
+std::unique_ptr<InternalIterator>
+MakeClippingIteratorAux(InternalIterator* iter,
+                        const Slice* start, const Slice* end, LessCMP cmp) {
+  if (nullptr == start)
+    return std::make_unique<ClippingIterator<false, true, LessCMP> >(iter, start, end, cmp);
+  else if (nullptr == end)
+    return std::make_unique<ClippingIterator<true, false, LessCMP> >(iter, start, end, cmp);
+  else
+    return std::make_unique<ClippingIterator<true, true, LessCMP> >(iter, start, end, cmp);
+}
+
+inline
+std::unique_ptr<InternalIterator>
+MakeClippingIterator(InternalIterator* iter,
+ const Slice* start, const Slice* end,
+ const InternalKeyComparator* cmp) {
+ if (cmp->IsForwardBytewise())
+ return MakeClippingIteratorAux(iter, start, end, {});
+ else if (cmp->IsReverseBytewise())
+ return MakeClippingIteratorAux(iter, start, end, {});
+ else
+ return MakeClippingIteratorAux(iter, start, end, {cmp});
+}
+
+inline
+std::unique_ptr<InternalIterator>
+MakeClippingIterator(InternalIterator* iter,
+ const Slice* start, const Slice* end,
+ const Comparator* cmp) {
+ if (cmp->IsForwardBytewise())
+ return MakeClippingIteratorAux(iter, start, end, {});
+ else if (cmp->IsReverseBytewise())
+ return MakeClippingIteratorAux(iter, start, end, {});
+ else
+ return MakeClippingIteratorAux(iter, start, end, {cmp});
+}
+
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/clipping_iterator_test.cc b/db/compaction/clipping_iterator_test.cc
index b2b1670489..31a0a4e00b 100644
--- a/db/compaction/clipping_iterator_test.cc
+++ b/db/compaction/clipping_iterator_test.cc
@@ -38,12 +38,14 @@ class BoundsCheckingVectorIterator : public VectorIterator {
Next();
if (!Valid()) {
+ result->is_valid = false;
return false;
}
- result->key = key();
+ result->SetKey(this->key());
result->bound_check_result = UpperBoundCheckResult();
result->value_prepared = true;
+ result->is_valid = true;
return true;
}
@@ -109,7 +111,8 @@ TEST_P(ClippingIteratorTest, Clip) {
&end, BytewiseComparator())
: new VectorIterator(input_keys, input_values, BytewiseComparator()));
- ClippingIterator clip(input.get(), &start, &end, BytewiseComparator());
+ auto p_clip = MakeClippingIterator(input.get(), &start, &end, BytewiseComparator());
+ auto& clip = *p_clip;
// The range the clipping iterator should return values from. This is
// essentially the intersection of the input range [1, 4) and the clipping
@@ -168,7 +171,7 @@ TEST_P(ClippingIteratorTest, Clip) {
for (size_t i = data_start_idx + 1; i < data_end_idx; ++i) {
IterateResult result;
ASSERT_TRUE(clip.NextAndGetResult(&result));
- ASSERT_EQ(result.key, keys[i]);
+ ASSERT_EQ(result.key(), keys[i]);
ASSERT_EQ(result.bound_check_result, IterBoundCheck::kInbound);
ASSERT_TRUE(clip.Valid());
ASSERT_EQ(clip.key(), keys[i]);
diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc
index bbab8f79fb..516940f2b7 100644
--- a/db/compaction/compaction.cc
+++ b/db/compaction/compaction.cc
@@ -24,8 +24,10 @@ namespace ROCKSDB_NAMESPACE {
const uint64_t kRangeTombstoneSentinel =
PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion);
-int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
- auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b));
+template<class CmpNoTS>
+ROCKSDB_FLATTEN
+int sstableKeyCompare(CmpNoTS ucmp, const Slice& a, const Slice& b) {
+ auto c = ucmp(ExtractUserKey(a), ExtractUserKey(b));
if (c != 0) {
return c;
}
@@ -40,27 +42,26 @@ int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
}
return 0;
}
+#define sstableKeyCompareInstantiate(CmpNoTS) \
+ template int sstableKeyCompare(CmpNoTS, const Slice&, const Slice&)
-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
- const InternalKey& b) {
- if (a == nullptr) {
- return -1;
- }
- return sstableKeyCompare(user_cmp, *a, b);
-}
+sstableKeyCompareInstantiate(ForwardBytewiseCompareUserKeyNoTS);
+sstableKeyCompareInstantiate(ReverseBytewiseCompareUserKeyNoTS);
+sstableKeyCompareInstantiate(VirtualFunctionCompareUserKeyNoTS);
-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
- const InternalKey* b) {
- if (b == nullptr) {
- return -1;
+uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+ uint64_t sum = 0;
+ for (size_t i = 0; i < files.size() && files[i]; i++) {
+ sum += files[i]->fd.GetFileSize();
}
- return sstableKeyCompare(user_cmp, a, *b);
+ return sum;
}
-uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
+uint64_t TotalFileRawKV(const std::vector<FileMetaData*>& files) {
uint64_t sum = 0;
for (size_t i = 0; i < files.size() && files[i]; i++) {
- sum += files[i]->fd.GetFileSize();
+ if (auto reader = files[i]->fd.table_reader)
+ sum += reader->GetTableProperties()->raw_size();
}
return sum;
}
@@ -342,12 +343,16 @@ Compaction::Compaction(
? Compaction::kInvalidLevel
: EvaluatePenultimateLevel(vstorage, immutable_options_,
start_level_, output_level_)) {
+ is_compaction_woker_ = IsCompactionWorker(); // preload to speed up
MarkFilesBeingCompacted(true);
if (is_manual_compaction_) {
compaction_reason_ = CompactionReason::kManualCompaction;
}
if (max_subcompactions_ == 0) {
- max_subcompactions_ = _mutable_db_options.max_subcompactions;
+ if (output_level_ > 0 && 0 == start_level_ && _mutable_db_options.max_level1_subcompactions)
+ max_subcompactions_ = _mutable_db_options.max_level1_subcompactions;
+ else
+ max_subcompactions_ = _mutable_db_options.max_subcompactions;
}
// for the non-bottommost levels, it tries to build files match the target
@@ -378,6 +383,7 @@ Compaction::Compaction(
// Every compaction regardless of any compaction reason may respect the
// existing compact cursor in the output level to split output files
output_split_key_ = nullptr;
+#if defined(ROCKSDB_UNIT_TEST)
if (immutable_options_.compaction_style == kCompactionStyleLevel &&
immutable_options_.compaction_pri == kRoundRobin) {
const InternalKey* cursor =
@@ -395,6 +401,7 @@ Compaction::Compaction(
}
}
}
+#endif
PopulatePenultimateLevelOutputRange();
}
@@ -516,6 +523,10 @@ bool Compaction::InputCompressionMatchesOutput() const {
return matches;
}
+bool TableFactory::InputCompressionMatchesOutput(const Compaction* c) const {
+ return c->InputCompressionMatchesOutput();
+}
+
bool Compaction::IsTrivialMove() const {
// Avoid a move if there is lots of overlapping grandparent data.
// Otherwise, the move could create a parent file that will require
@@ -550,6 +561,17 @@ bool Compaction::IsTrivialMove() const {
return false;
}
+#if !defined(ROCKSDB_UNIT_TEST) // ToplingDB specific
+ if (kCompactionStyleLevel == immutable_options_.compaction_style) {
+ auto& cfo = mutable_cf_options_;
+ if (1 == output_level_ &&
+ immutable_options_.compaction_executor_factory &&
+ cfo.write_buffer_size > cfo.target_file_size_base * 3/2) {
+ return false;
+ }
+ }
+#endif
+
// Used in universal compaction, where trivial move can be done if the
// input files are non overlapping
if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
@@ -560,7 +582,7 @@ bool Compaction::IsTrivialMove() const {
if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
input(0, 0)->fd.GetPathId() == output_path_id() &&
- InputCompressionMatchesOutput())) {
+ immutable_options_.table_factory->InputCompressionMatchesOutput(this))) {
return false;
}
@@ -611,6 +633,8 @@ bool Compaction::KeyNotExistsBeyondOutputLevel(
assert(level_ptrs->size() == static_cast(number_levels_));
if (bottommost_level_) {
return true;
+ } else if (is_compaction_woker_) {
+ return false;
} else if (output_level_ != 0 &&
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
// Maybe use binary search to find right entry instead of linear search?
@@ -829,6 +853,7 @@ std::unique_ptr Compaction::CreateCompactionFilter() const {
context.input_start_level = start_level_;
context.column_family_id = cfd_->GetID();
context.reason = TableFileCreationReason::kCompaction;
+<<<<<<< HEAD
context.input_table_properties = GetInputTableProperties();
if (context.input_table_properties.empty()) {
ROCKS_LOG_WARN(
@@ -837,6 +862,9 @@ std::unique_ptr Compaction::CreateCompactionFilter() const {
"for compaction.");
}
+=======
+ context.smallest_seqno = GetSmallestSeqno();
+>>>>>>> sideplugin-8.04.0-2023-06-20-2926e071
return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
context);
}
@@ -852,6 +880,7 @@ std::unique_ptr Compaction::CreateSstPartitioner() const {
context.output_level = output_level_;
context.smallest_user_key = smallest_user_key_;
context.largest_user_key = largest_user_key_;
+ context.target_output_file_size = target_output_file_size_;
return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
}
@@ -864,12 +893,14 @@ bool Compaction::ShouldFormSubcompactions() const {
return false;
}
+#if defined(ROCKSDB_UNIT_TEST)
// Round-Robin pri under leveled compaction allows subcompactions by default
// and the number of subcompactions can be larger than max_subcompactions_
if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
return output_level_ > 0;
}
+#endif
if (max_subcompactions_ <= 1) {
return false;
@@ -985,4 +1016,14 @@ int Compaction::EvaluatePenultimateLevel(
return penultimate_level;
}
+uint64_t Compaction::GetSmallestSeqno() const {
+ uint64_t smallest_seqno = UINT64_MAX;
+ for (auto& eachlevel : inputs_) {
+ for (auto& eachfile : eachlevel.files)
+ if (smallest_seqno > eachfile->fd.smallest_seqno)
+ smallest_seqno = eachfile->fd.smallest_seqno;
+ }
+ return smallest_seqno;
+}
+
} // namespace ROCKSDB_NAMESPACE
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h
index 50c75f70b2..2ba7e70053 100644
--- a/db/compaction/compaction.h
+++ b/db/compaction/compaction.h
@@ -31,23 +31,39 @@ namespace ROCKSDB_NAMESPACE {
// that key never appears in the database. We don't want adjacent sstables to
// be considered overlapping if they are separated by the range tombstone
// sentinel.
-int sstableKeyCompare(const Comparator* user_cmp, const Slice&, const Slice&);
-inline int sstableKeyCompare(const Comparator* user_cmp, const Slice& a,
- const InternalKey& b) {
- return sstableKeyCompare(user_cmp, a, b.Encode());
+
+template<class CmpNoTS>
+extern int sstableKeyCompare(CmpNoTS, const Slice& a, const Slice& b);
+inline int
+sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
+ return sstableKeyCompare(VirtualFunctionCompareUserKeyNoTS{uc}, a, b);
+}
+template<class CmpNoTS> inline int
+sstableKeyCompare(CmpNoTS cmp, const Slice& a, const InternalKey& b) {
+ return sstableKeyCompare(cmp, a, b.Encode());
+}
+template<class CmpNoTS> inline int
+sstableKeyCompare(CmpNoTS cmp, const InternalKey& a, const Slice& b) {
+ return sstableKeyCompare(cmp, a.Encode(), b);
+}
+template<class CmpNoTS> inline int
+sstableKeyCompare(CmpNoTS cmp, const InternalKey& a, const InternalKey& b) {
+ return sstableKeyCompare(cmp, a.Encode(), b.Encode());
}
-inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
- const Slice& b) {
- return sstableKeyCompare(user_cmp, a.Encode(), b);
+template<class CmpNoTS> inline int
+sstableKeyCompare(CmpNoTS cmp, const InternalKey* a, const InternalKey& b) {
+ if (a == nullptr)
+ return -1;
+ else
+ return sstableKeyCompare(cmp, *a, b);
}
-inline int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
- const InternalKey& b) {
- return sstableKeyCompare(user_cmp, a.Encode(), b.Encode());
+template<class CmpNoTS> inline int
+sstableKeyCompare(CmpNoTS cmp, const InternalKey& a, const InternalKey* b) {
+ if (b == nullptr)
+ return -1;
+ else
+ return sstableKeyCompare(cmp, a, *b);
}
-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
- const InternalKey& b);
-int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
- const InternalKey* b);
// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
// largest] that exactly spans one ore more neighbouring SSTs on the same
@@ -173,7 +189,7 @@ class Compaction {
return &inputs_[compaction_input_level].files;
}
-  const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
+  const std::vector<CompactionInputFiles>* inputs() const { return &inputs_; }
// Returns the LevelFilesBrief of the specified compaction input level.
const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
@@ -197,6 +213,11 @@ class Compaction {
// Whether need to write output file to second DB path.
uint32_t output_path_id() const { return output_path_id_; }
+ const DbPath& output_path() const {
+ ROCKSDB_VERIFY_LT(output_path_id_, immutable_options_.cf_paths.size());
+ return immutable_options_.cf_paths[output_path_id_];
+ }
+
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
@@ -243,6 +264,8 @@ class Compaction {
// Is this compaction creating a file in the bottom most level?
bool bottommost_level() const { return bottommost_level_; }
+ void set_bottommost_level(bool v) { bottommost_level_ = v; }
+
// Is the compaction compact to the last level
bool is_last_level() const {
return output_level_ == immutable_options_.num_levels - 1;
@@ -415,6 +438,7 @@ class Compaction {
bool ShouldNotifyOnCompactionCompleted() const {
return notify_on_compaction_completion_;
}
+ uint64_t GetSmallestSeqno() const;
static constexpr int kInvalidLevel = -1;
@@ -504,6 +528,7 @@ class Compaction {
// logic might pick a subset of the files that aren't overlapping. if
// that is the case, set the value to false. Otherwise, set it true.
bool l0_files_might_overlap_;
+ bool is_compaction_woker_;
// Compaction input files organized by level. Constant after construction
const std::vector<CompactionInputFiles> inputs_;
@@ -517,7 +542,7 @@ class Compaction {
const double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?
- const bool bottommost_level_;
+ bool bottommost_level_;
// Does this compaction include all sst files?
const bool is_full_compaction_;
@@ -533,6 +558,7 @@ class Compaction {
// Does input compression match the output compression?
bool InputCompressionMatchesOutput() const;
+ friend class TableFactory; // use InputCompressionMatchesOutput
TablePropertiesCollection input_table_properties_;
TablePropertiesCollection output_table_properties_;
@@ -597,4 +623,7 @@ struct PerKeyPlacementContext {
// Return sum of sizes of all files in `files`.
extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
+// Return sum of raw kv sizes of all files in `files`.
+extern uint64_t TotalFileRawKV(const std::vector<FileMetaData*>& files);
+
} // namespace ROCKSDB_NAMESPACE
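
Note: the hunk above replaces the virtual-Comparator overloads of sstableKeyCompare with a small template family so the per-key comparison can be inlined when the user comparator is plain bytewise. The standalone sketch below only illustrates that devirtualization pattern; BytewiseCmpNoTS, VirtualCmpAdapter and KeyCompare are invented names that mirror the shape of ForwardBytewiseCompareUserKeyNoTS / VirtualFunctionCompareUserKeyNoTS used by the patch, not ToplingDB's actual types.

  // Illustration only -- not part of the patch.
  #include <cstring>
  #include <string_view>

  struct Comparator {                 // stand-in for rocksdb::Comparator
    virtual ~Comparator() = default;
    virtual int Compare(std::string_view a, std::string_view b) const = 0;
  };

  struct BytewiseCmpNoTS {            // concrete functor: calls can be inlined
    int operator()(std::string_view a, std::string_view b) const {
      const size_t n = a.size() < b.size() ? a.size() : b.size();
      const int r = n ? std::memcmp(a.data(), b.data(), n) : 0;
      if (r != 0) return r;
      return a.size() < b.size() ? -1 : (a.size() > b.size() ? 1 : 0);
    }
  };

  struct VirtualCmpAdapter {          // fallback: one virtual call per compare
    const Comparator* uc;
    int operator()(std::string_view a, std::string_view b) const {
      return uc->Compare(a, b);
    }
  };

  // Written once, instantiated per comparator type; the bytewise instantiation
  // leaves no indirect call inside the callers' per-key loops.
  template <class CmpNoTS>
  int KeyCompare(CmpNoTS cmp, std::string_view a, std::string_view b) {
    return cmp(a, b);
  }

The same idea reappears later in this patch in CompactionOutputs::UpdateGrandparentBoundaryInfo, which selects the instantiation once per compaction instead of paying a virtual call per key.
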
diff --git a/db/compaction/compaction_executor.cc b/db/compaction/compaction_executor.cc
new file mode 100644
index 0000000000..727acc3beb
--- /dev/null
+++ b/db/compaction/compaction_executor.cc
@@ -0,0 +1,332 @@
+//
+// Created by leipeng on 2021/1/11.
+//
+
+#include "compaction_executor.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+CompactionParams::CompactionParams() {
+ is_deserialized = false;
+}
+CompactionParams::~CompactionParams() {
+ if (is_deserialized) {
+ ROCKSDB_VERIFY(IsCompactionWorker());
+ /*
+ for (auto& x : *inputs) {
+ for (auto& e : x.atomic_compaction_unit_boundaries) {
+ delete e.smallest;
+ delete e.largest;
+ }
+ }
+ */
+ if (grandparents) {
+ for (auto meta : *grandparents) {
+ delete meta;
+ }
+ delete grandparents;
+ }
+ if (inputs) {
+ for (auto& level_files : *inputs) {
+ for (auto meta : level_files.files)
+ delete meta;
+ }
+ delete inputs;
+ }
+ delete existing_snapshots;
+ //delete compaction_job_stats;
+ }
+ else {
+ //ROCKSDB_VERIFY(!IsCompactionWorker());
+ }
+}
+
+#if defined(_MSC_VER)
+static std::string html_user_key_decode(const CompactionParams&, Slice uk) {
+ return uk.ToString(true);
+}
+#else
+std::string __attribute__((weak))
+CompactionParams_html_user_key_decode(const CompactionParams&, Slice);
+static std::string html_user_key_decode(const CompactionParams& cp, Slice uk) {
+ if (CompactionParams_html_user_key_decode)
+ return CompactionParams_html_user_key_decode(cp, uk);
+ else
+ return uk.ToString(true);
+}
+#endif
+
+static void PrintVersionSetSerDe(FILE* fp, const VersionSetSerDe& v) {
+ fprintf(fp, "VersionSetSerDe\n");
+ fprintf(fp, " last_sequence = %zd, "
+ "last_allocated_sequence = %zd, "
+ "last_published_sequence = %zd\n",
+ size_t(v.last_sequence),
+ size_t(v.last_allocated_sequence),
+ size_t(v.last_published_sequence));
+ fprintf(fp, " next_file_number = %zd, "
+ "min_log_number_to_keep_2pc = %zd, "
+ "manifest_file_number = %zd, "
+ "options_file_number = %zd, "
+ "prev_log_number = %zd, "
+ "current_version_number = %zd\n",
+ size_t(v.next_file_number),
+ #if ROCKSDB_MAJOR < 7
+ size_t(v.min_log_number_to_keep_2pc),
+ #else
+ size_t(v.min_log_number_to_keep),
+ #endif
+ size_t(v.manifest_file_number),
+ size_t(v.options_file_number),
+ size_t(v.prev_log_number),
+ size_t(v.current_version_number));
+}
+static void PrintFileMetaData(const CompactionParams& cp,
+ FILE* fp, const FileMetaData* f) {
+ Slice temperature = enum_name(f->temperature);
+ std::string lo = html_user_key_decode(cp, f->smallest.user_key());
+ std::string hi = html_user_key_decode(cp, f->largest.user_key());
+ fprintf(fp,
+ " %06zd.sst : entries = %zd, del = %zd, rks = %zd, rvs = %zd, "
+ "fsize = %zd : %zd, temp = %.*s, seq = %zd : %zd, rng = %.*s : %.*s\n",
+ size_t(f->fd.GetNumber()),
+ size_t(f->num_entries), size_t(f->num_deletions),
+ size_t(f->raw_key_size), size_t(f->raw_value_size),
+ size_t(f->fd.file_size), size_t(f->compensated_file_size),
+ int(temperature.size_), temperature.data_,
+ size_t(f->fd.smallest_seqno), size_t(f->fd.largest_seqno),
+ int(lo.size()), lo.data(), int(hi.size()), hi.data());
+}
+
+std::string CompactionParams::DebugString() const {
+ size_t mem_len = 0;
+ char* mem_buf = nullptr;
+ FILE* fp = open_memstream(&mem_buf, &mem_len);
+ fprintf(fp, "job_id = %d, output_level = %d, dbname = %s, cfname = %s\n",
+ job_id, output_level, dbname.c_str(), cf_name.c_str());
+ fprintf(fp, "bottommost_level = %d, compaction_reason = %s\n",
+ bottommost_level, enum_cstr(compaction_reason));
+ fprintf(fp, "smallest_user_key = %s\n", html_user_key_decode(*this, smallest_user_key).c_str());
+ fprintf(fp, "llargest_user_key = %s\n", html_user_key_decode(*this, largest_user_key).c_str());
+ for (size_t i = 0; i < inputs->size(); ++i) {
+ auto& l = inputs->at(i);
+ fprintf(fp, "inputs.size = %zd : %zd : level = %d, size = %3zd\n",
+ inputs->size(), i, l.level, l.size());
+ for (auto fmd : l.files) {
+ PrintFileMetaData(*this, fp, fmd);
+ }
+ }
+ if (grandparents) {
+ fprintf(fp, "grandparents.size = %zd\n", grandparents->size());
+ for (size_t i = 0; i < grandparents->size(); ++i) {
+ FileMetaData* fmd = grandparents->at(i);
+ PrintFileMetaData(*this, fp, fmd);
+ }
+ }
+ else {
+ fprintf(fp, "grandparents = nullptr\n");
+ }
+ if (existing_snapshots) {
+ fprintf(fp, "existing_snapshots.size = %zd\n", existing_snapshots->size());
+ }
+ else {
+ fprintf(fp, "existing_snapshots = nullptr\n");
+ }
+ fprintf(fp, "level_compaction_dynamic_file_size = %s",
+ level_compaction_dynamic_file_size ? "true" : "false");
+ PrintVersionSetSerDe(fp, version_set);
+ fclose(fp);
+ std::string result(mem_buf, mem_len);
+ free(mem_buf);
+ return result;
+}
+
+// res[0] : raw
+// res[1] : zip
+void CompactionParams::InputBytes(size_t* res) const {
+ size_t raw = 0, zip = 0;
+ for (auto& eachlevel : *inputs) {
+ for (auto& eachfile : eachlevel.files) {
+ zip += eachfile->fd.file_size;
+ raw += eachfile->raw_key_size + eachfile->raw_value_size;
+ }
+ }
+ res[0] = raw;
+ res[1] = zip;
+}
+
+CompactionResults::CompactionResults() {
+ curl_time_usec = 0;
+ work_time_usec = 0;
+ mount_time_usec = 0;
+ prepare_time_usec = 0;
+ waiting_time_usec = 0;
+ output_index_size = 0;
+ output_data_size = 0;
+}
+CompactionResults::~CompactionResults() {}
+
+struct MyVersionSet : VersionSet {
+ void From(const VersionSetSerDe& version_set) {
+ next_file_number_ = version_set.next_file_number;
+ last_sequence_ = version_set.last_sequence;
+ // the fields below are not strictly necessary, but we serialize them
+ // for debugging completeness
+ last_allocated_sequence_ = version_set.last_allocated_sequence;
+ last_published_sequence_ = version_set.last_published_sequence;
+ #if ROCKSDB_MAJOR < 7
+ min_log_number_to_keep_2pc_ = version_set.min_log_number_to_keep_2pc;
+ #else
+ min_log_number_to_keep_ = version_set.min_log_number_to_keep;
+ #endif
+ manifest_file_number_ = version_set.manifest_file_number;
+ options_file_number_ = version_set.options_file_number;
+ //pending_manifest_file_number_ is transient at runtime, do NOT serialize!
+ //pending_manifest_file_number_ = version_set.pending_manifest_file_number;
+ prev_log_number_ = version_set.prev_log_number;
+ current_version_number_ = version_set.current_version_number;
+ }
+ void To(VersionSetSerDe& version_set) const {
+ version_set.next_file_number = next_file_number_;
+ version_set.last_sequence = last_sequence_;
+ // the fields below are not strictly necessary, but we serialize them
+ // for debugging completeness
+ version_set.last_allocated_sequence = last_allocated_sequence_;
+ version_set.last_published_sequence = last_published_sequence_;
+ #if ROCKSDB_MAJOR < 7
+ version_set.min_log_number_to_keep_2pc = min_log_number_to_keep_2pc_;
+ #else
+ version_set.min_log_number_to_keep = min_log_number_to_keep_;
+ #endif
+ version_set.manifest_file_number = manifest_file_number_;
+ version_set.options_file_number = options_file_number_;
+ //pending_manifest_file_number_ is transient at runtime, do NOT serialize!
+ //version_set.pending_manifest_file_number = pending_manifest_file_number_;
+ version_set.prev_log_number = prev_log_number_;
+ version_set.current_version_number = current_version_number_;
+ }
+};
+void VersionSetSerDe::From(const VersionSet* vs) {
+ static_cast<const MyVersionSet*>(vs)->To(*this); // NOLINT
+}
+void VersionSetSerDe::To(VersionSet* vs) const {
+ static_cast<MyVersionSet*>(vs)->From(*this); // NOLINT
+}
+
+CompactionExecutor::~CompactionExecutor() = default;
+CompactionExecutorFactory::~CompactionExecutorFactory() = default;
+
+std::string CompactionExecutorFactory::JobUrl(const std::string&, int, int) const {
+ return std::string(); // empty string
+}
+
+static bool g_is_compaction_worker = false;
+bool IsCompactionWorker() {
+ return g_is_compaction_worker;
+}
+void SetAsCompactionWorker() {
+ g_is_compaction_worker = true;
+}
+
+/////////////////////////////////////////////////////////////////////////////
+std::string GetDirFromEnv(const char* name, const char* Default) {
+ const char* dir = getenv(name);
+ if (nullptr == dir) {
+ ROCKSDB_VERIFY(nullptr != Default);
+ dir = Default;
+ }
+ size_t dir_name_len = strlen(dir);
+ ROCKSDB_VERIFY(dir_name_len > 0);
+ while (dir_name_len && '/' == dir[dir_name_len-1]) {
+ dir_name_len--;
+ }
+ ROCKSDB_VERIFY(dir_name_len > 0);
+ return std::string(dir, dir_name_len);
+}
+
+bool ReplacePrefix(Slice Old, Slice New, Slice str, std::string* res) {
+ ROCKSDB_VERIFY(Old.size_ > 0);
+ ROCKSDB_VERIFY(New.size_ > 0);
+ while (Old.size_ && Old.data_[Old.size_-1] == '/') {
+ --Old.size_;
+ }
+ while (New.size_ && New.data_[New.size_-1] == '/') {
+ --New.size_;
+ }
+ ROCKSDB_VERIFY(Old.size_ > 0);
+ ROCKSDB_VERIFY(New.size_ > 0);
+ if (str.starts_with(Old)) {
+ size_t suffixLen = str.size_ - Old.size_;
+ res->reserve(New.size_ + suffixLen);
+ res->assign(New.data_, New.size_);
+ res->append(str.data_ + Old.size_, suffixLen);
+ return true;
+ }
+ return false;
+}
+
+std::string ReplacePrefix(Slice Old, Slice New, Slice str) {
+ std::string res;
+ if (ReplacePrefix(Old, New, str, &res)) {
+ return res;
+ }
+ ROCKSDB_DIE("str = '%.*s' does not start with Old='%.*s'",
+ int(str.size()), str.data(), int(Old.size()), Old.data());
+}
+
+void ReplaceAll(std::string& str, Slice from, Slice to) {
+ if (from.empty()) return;
+ size_t start_pos = 0;
+ while ((start_pos = str.find(from.data(), start_pos)) != std::string::npos) {
+ str.replace(start_pos, from.size(), to.data(), to.size());
+ start_pos += to.size();
+ }
+}
+std::string ReplaceAll(Slice str, Slice from, Slice to) {
+ std::string tmp(str.data(), str.size());
+ ReplaceAll(tmp, from, to);
+ return tmp;
+}
+std::string MakePath(std::string dir, Slice sub) {
+ while (!dir.empty() && '/' == dir.back()) {
+ dir.pop_back();
+ }
+ dir.reserve(dir.size() + 1 + sub.size());
+ ROCKSDB_VERIFY(!sub.empty());
+ while (!sub.empty() && '/' == sub[0]) {
+ sub.remove_prefix(1);
+ }
+ ROCKSDB_VERIFY(!sub.empty());
+ dir.push_back('/');
+ dir.append(sub.data(), sub.size());
+ return dir;
+}
+
+std::string& AppendJobID(std::string& dir, int job_id) {
+ while (!dir.empty() && '/' == dir.back()) {
+ dir.pop_back();
+ }
+ char buf[32];
+ dir.append(buf, snprintf(buf, sizeof(buf), "/job-%05d", job_id));
+ return dir;
+}
+std::string CatJobID(const std::string& dir, int job_id) {
+ std::string output_path = dir;
+ AppendJobID(output_path, job_id);
+ return output_path;
+}
+std::string& AppendAttempt(std::string& dir, int attempt) {
+ while (!dir.empty() && '/' == dir.back()) {
+ dir.pop_back();
+ }
+ char buf[32];
+ dir.append(buf, snprintf(buf, sizeof(buf), "/att-%02d", attempt));
+ return dir;
+}
+std::string CatAttempt(const std::string& dir, int attempt) {
+ std::string output_path = dir;
+ AppendAttempt(output_path, attempt);
+ return output_path;
+}
+
+} // namespace ROCKSDB_NAMESPACE
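
Note: the path helpers at the end of this new file (GetDirFromEnv, MakePath, CatJobID, CatAttempt) compose the per-job working directory used by a remote compaction. The self-contained sketch below only illustrates the resulting layout; it re-implements simplified copies (suffixed Demo) instead of including the header, and the NFS root path is made up.

  // Illustration only -- not part of the patch.
  #include <cstdio>
  #include <string>

  static std::string MakePathDemo(std::string dir, std::string sub) {
    while (!dir.empty() && dir.back() == '/') dir.pop_back();
    while (!sub.empty() && sub.front() == '/') sub.erase(0, 1);
    return dir + "/" + sub;
  }
  static std::string CatJobIDDemo(const std::string& dir, int job_id) {
    char buf[32];
    std::snprintf(buf, sizeof(buf), "/job-%05d", job_id);
    return dir + buf;
  }
  static std::string CatAttemptDemo(const std::string& dir, int attempt) {
    char buf[32];
    std::snprintf(buf, sizeof(buf), "/att-%02d", attempt);
    return dir + buf;
  }

  int main() {
    std::string dir = MakePathDemo("/nfs/dcompact/", "db1");
    dir = CatAttemptDemo(CatJobIDDemo(dir, 12), 1);
    std::printf("%s\n", dir.c_str());  // prints /nfs/dcompact/db1/job-00012/att-01
    return 0;
  }

The real helpers additionally strip trailing slashes before appending and verify non-empty inputs with ROCKSDB_VERIFY; the sketch keeps only the happy path.
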
diff --git a/db/compaction/compaction_executor.h b/db/compaction/compaction_executor.h
new file mode 100644
index 0000000000..47c32969c5
--- /dev/null
+++ b/db/compaction/compaction_executor.h
@@ -0,0 +1,192 @@
+//
+// Created by leipeng on 2021/1/11.
+//
+#pragma once
+#include "compaction_job.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+struct ObjectRpcParam {
+ std::string clazz;
+ std::string params; // construction json params
+ typedef std::function serde_fn_t;
+ serde_fn_t serde;
+};
+struct VersionSetSerDe {
+ uint64_t last_sequence;
+ uint64_t last_allocated_sequence;
+ uint64_t last_published_sequence;
+ uint64_t next_file_number;
+ #if ROCKSDB_MAJOR < 7
+ uint64_t min_log_number_to_keep_2pc;
+ #else
+ uint64_t min_log_number_to_keep;
+ #endif
+ uint64_t manifest_file_number;
+ uint64_t options_file_number;
+ //uint64_t pending_manifest_file_number;
+ uint64_t prev_log_number;
+ uint64_t current_version_number;
+ void From(const VersionSet*);
+ void To(VersionSet*) const;
+};
+struct CompactionParams {
+ CompactionParams(const CompactionParams&) = delete;
+ CompactionParams& operator=(const CompactionParams&) = delete;
+ CompactionParams();
+ ~CompactionParams();
+ int job_id;
+ int num_levels;
+ int output_level;
+ uint32_t cf_id;
+ std::string cf_name;
+ const std::vector<CompactionInputFiles>* inputs = nullptr;
+ VersionSetSerDe version_set;
+ uint64_t target_file_size;
+ uint64_t max_compaction_bytes;
+
+ // we add a dedicated path to the compaction worker's cf_paths as the
+ // output path, which reduces changes to the existing rocksdb code.
+ // the output path should be the last element of cf_paths, so the
+ // output_path_id field is not needed.
+ //uint32_t output_path_id; // point to the extra cf_path
+ //std::string output_path; // will append to cfopt.cf_paths on remote node?
+ std::vector<DbPath> cf_paths;
+
+ uint32_t max_subcompactions; // num_threads
+ CompressionType compression;
+ CompressionOptions compression_opts;
+ const std::vector<FileMetaData*>* grandparents = nullptr;
+ double score;
+ bool manual_compaction;
+ bool deletion_compaction;
+ InfoLogLevel compaction_log_level;
+ CompactionReason compaction_reason;
+
+ //VersionSet* version_set;
+ SequenceNumber preserve_deletes_seqnum;
+ const std::vector<SequenceNumber>* existing_snapshots = nullptr;
+ SequenceNumber smallest_seqno;
+ SequenceNumber earliest_write_conflict_snapshot;
+ bool paranoid_file_checks;
+ uint32_t code_version;
+ std::string code_githash;
+ std::string hoster_root;
+ std::string instance_name;
+ std::string dbname;
+ std::string db_id;
+ std::string db_session_id;
+ std::string full_history_ts_low;
+ //CompactionJobStats* compaction_job_stats = nullptr; // this is out param
+ //SnapshotChecker* snapshot_checker; // not used
+ //FSDirectory* db_directory;
+ //FSDirectory* output_directory;
+ //FSDirectory* blob_output_directory;
+
+ std::string smallest_user_key; // serialization must come before
+ std::string largest_user_key; // the ObjectRpcParam fields
+ //ObjectRpcParam compaction_filter; // don't use compaction_filter
+ ObjectRpcParam compaction_filter_factory; // always use
+ ObjectRpcParam merge_operator;
+ ObjectRpcParam user_comparator;
+ ObjectRpcParam table_factory;
+ ObjectRpcParam prefix_extractor;
+ ObjectRpcParam sst_partitioner_factory;
+ ObjectRpcParam html_user_key_coder;
+
+ //bool skip_filters;
+ bool allow_ingest_behind;
+ bool preserve_deletes;
+ bool bottommost_level;
+ bool is_deserialized;
+ bool level_compaction_dynamic_file_size;
+ CompactionStyle compaction_style;
+ CompactionPri compaction_pri;
+ std::vector listeners;
+ std::vector table_properties_collector_factories;
+ std::string extensible_js_data;
+
+ // CompactionFilterFactory ... can have individual serde files
+ mutable std::vector extra_serde_files;
+ Logger* info_log = nullptr; // do not serialize, just for running process
+ mutable class UserKeyCoder* p_html_user_key_coder = nullptr;
+ const std::atomic* shutting_down = nullptr; // do not serialize
+
+ std::string DebugString() const;
+ void InputBytes(size_t* res) const;
+};
+
+struct CompactionResults {
+ CompactionResults(const CompactionResults&) = delete;
+ CompactionResults& operator=(const CompactionResults&) = delete;
+ CompactionResults();
+ ~CompactionResults();
+ struct FileMinMeta {
+ uint64_t file_number;
+ uint64_t file_size;
+ uint64_t smallest_seqno;
+ uint64_t largest_seqno;
+ InternalKey smallest_ikey;
+ InternalKey largest_ikey;
+ bool marked_for_compaction;
+ };
+ // collect remote statistics
+ struct RawStatistics {
+ uint64_t tickers[INTERNAL_TICKER_ENUM_MAX] = {0};
+ HistogramStat histograms[INTERNAL_HISTOGRAM_ENUM_MAX];
+ };
+
+ std::string output_dir;
+ std::vector<std::vector<FileMinMeta> > output_files;
+ InternalStats::CompactionStats compaction_stats;
+ CompactionJobStats job_stats;
+ RawStatistics statistics;
+ Status status;
+ size_t curl_time_usec; // set by CompactionExecutor, not worker
+ size_t work_time_usec;
+ size_t mount_time_usec; // mount nfs
+ size_t prepare_time_usec; // open nfs params/results
+ size_t waiting_time_usec; // wait in work queue
+
+ uint64_t output_index_size; // not serialized, just for DB-side convenience
+ uint64_t output_data_size; // not serialized, just for DB-side convenience
+
+ size_t all_time_usec() const {
+ return curl_time_usec + mount_time_usec + prepare_time_usec + work_time_usec;
+ }
+};
+
+class CompactionExecutor {
+ public:
+ virtual ~CompactionExecutor();
+ virtual void SetParams(CompactionParams*, const Compaction*) = 0;
+ virtual Status CopyOneFile(const std::string& src, const std::string& dst, off_t fsize) = 0;
+ virtual Status RenameFile(const std::string& src, const std::string& dst, off_t fsize) = 0;
+ virtual Status Execute(const CompactionParams&, CompactionResults*) = 0;
+ virtual void CleanFiles(const CompactionParams&, const CompactionResults&) = 0;
+};
+
+class CompactionExecutorFactory {
+ public:
+ virtual ~CompactionExecutorFactory();
+ virtual bool ShouldRunLocal(const Compaction*) const = 0;
+ virtual bool AllowFallbackToLocal() const = 0;
+ virtual CompactionExecutor* NewExecutor(const Compaction*) const = 0;
+ virtual const char* Name() const = 0;
+ virtual std::string JobUrl(const std::string& dbname, int job_id, int attempt) const;
+};
+
+/////////////////////////////////////////////////////////////////////////////
+
+std::string GetDirFromEnv(const char* name, const char* Default = nullptr);
+bool ReplacePrefix(Slice Old, Slice New, Slice str, std::string* res);
+std::string ReplacePrefix(Slice Old, Slice New, Slice str);
+void ReplaceAll(std::string& str, Slice from, Slice to);
+std::string ReplaceAll(Slice str, Slice from, Slice to);
+std::string MakePath(std::string dir, Slice sub);
+std::string& AppendJobID(std::string& path, int job_id);
+std::string CatJobID(const std::string& path, int job_id);
+std::string& AppendAttempt(std::string& path, int attempt);
+std::string CatAttempt(const std::string& path, int attempt);
+
+} // namespace ROCKSDB_NAMESPACE
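
Note: to make the plugin surface above concrete, here is a hypothetical skeleton of the two interfaces; the class names and the always-run-locally policy are invented for illustration. A real factory would be wired in through the column family's compaction_executor_factory option, which the CompactionJob::Run() changes later in this patch consult.

  // Illustration only -- not part of the patch.
  #include "db/compaction/compaction_executor.h"

  namespace ROCKSDB_NAMESPACE {

  class NoopExecutor : public CompactionExecutor {
   public:
    void SetParams(CompactionParams*, const Compaction*) override {}
    Status CopyOneFile(const std::string&, const std::string&, off_t) override {
      return Status::NotSupported("NoopExecutor");
    }
    Status RenameFile(const std::string&, const std::string&, off_t) override {
      return Status::NotSupported("NoopExecutor");
    }
    Status Execute(const CompactionParams&, CompactionResults* results) override {
      // a real executor would ship the params to a worker and fill *results
      results->status = Status::NotSupported("NoopExecutor");
      return results->status;
    }
    void CleanFiles(const CompactionParams&, const CompactionResults&) override {}
  };

  class LocalOnlyFactory : public CompactionExecutorFactory {
   public:
    // returning true sends every compaction down the RunLocal() path,
    // so the NoopExecutor above is never actually invoked
    bool ShouldRunLocal(const Compaction*) const override { return true; }
    bool AllowFallbackToLocal() const override { return true; }
    CompactionExecutor* NewExecutor(const Compaction*) const override {
      return new NoopExecutor;
    }
    const char* Name() const override { return "LocalOnlyFactory"; }
  };

  } // namespace ROCKSDB_NAMESPACE

NewExecutor returns a raw pointer because RunRemote() wraps it in a std::unique_ptr and owns it for the duration of one job.
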
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index 85d1c039bd..4fc614de4d 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -79,6 +79,8 @@ CompactionIterator::CompactionIterator(
clock_(env_->GetSystemClock().get()),
report_detailed_time_(report_detailed_time),
expect_valid_internal_key_(expect_valid_internal_key),
+ allow_ingest_behind_(compaction && compaction->allow_ingest_behind()),
+ supports_per_key_placement_(compaction && compaction->SupportsPerKeyPlacement()),
range_del_agg_(range_del_agg),
blob_file_builder_(blob_file_builder),
compaction_(std::move(compaction)),
@@ -117,11 +119,17 @@ CompactionIterator::CompactionIterator(
if (compaction_ != nullptr) {
level_ptrs_ = std::vector<size_t>(compaction_->number_levels(), 0);
+ if (auto c = compaction_->real_compaction()) {
+ if (level_ >= 0 && level_ < c->mutable_cf_options()->min_filter_level) {
+ compaction_filter_ = nullptr; // ignore compaction_filter_
+ }
+ }
}
#ifndef NDEBUG
// findEarliestVisibleSnapshot assumes this ordering.
for (size_t i = 1; i < snapshots_->size(); ++i) {
- assert(snapshots_->at(i - 1) < snapshots_->at(i));
+ ROCKSDB_VERIFY_F(snapshots_->at(i - 1) < snapshots_->at(i),
+ "[%zd]: %zd %zd", i, snapshots_->at(i - 1), snapshots_->at(i));
}
assert(timestamp_size_ == 0 || !full_history_ts_low_ ||
timestamp_size_ == full_history_ts_low_->size());
@@ -347,24 +355,34 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
env_ != nullptr && report_detailed_time_ ? timer.ElapsedNanos() : 0;
}
- if (decision == CompactionFilter::Decision::kUndetermined) {
+ switch (decision) {
+ default:
+ ROCKSDB_DIE("Bad decision = %d", int(decision));
+ break;
+ case CompactionFilter::Decision::kUndetermined:
// Should not reach here, since FilterV2/FilterV3 should never return
// kUndetermined.
status_ = Status::NotSupported(
"FilterV2/FilterV3 should never return kUndetermined");
validity_info_.Invalidate();
return false;
- }
-
- if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil &&
- cmp_->Compare(*compaction_filter_skip_until_.rep(), ikey_.user_key) <=
+ case CompactionFilter::Decision::kRemoveAndSkipUntil:
+ if (cmp_->Compare(compaction_filter_skip_until_.Encode(), ikey_.user_key) <=
0) {
- // Can't skip to a key smaller than the current one.
- // Keep the key as per FilterV2/FilterV3 documentation.
- decision = CompactionFilter::Decision::kKeep;
- }
-
- if (decision == CompactionFilter::Decision::kRemove) {
+ // Can't skip to a key smaller than the current one.
+ // Keep the key as per FilterV2/FilterV3 documentation.
+ // decision = CompactionFilter::Decision::kKeep;
+ } else {
+ *need_skip = true;
+ compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
+ kValueTypeForSeek);
+ *skip_until = compaction_filter_skip_until_.Encode();
+ }
+ break;
+ case CompactionFilter::Decision::kKeep:
+ // do nothing
+ break;
+ case CompactionFilter::Decision::kRemove:
// convert the current key to a delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeDeletion;
@@ -372,7 +390,8 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with delete
value_.clear();
iter_stats_.num_record_drop_user++;
- } else if (decision == CompactionFilter::Decision::kPurge) {
+ break;
+ case CompactionFilter::Decision::kPurge:
// convert the current key to a single delete; key_ is pointing into
// current_key_ at this point, so updating current_key_ updates key()
ikey_.type = kTypeSingleDeletion;
@@ -380,19 +399,16 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
// no value associated with single delete
value_.clear();
iter_stats_.num_record_drop_user++;
- } else if (decision == CompactionFilter::Decision::kChangeValue) {
+ break;
+ case CompactionFilter::Decision::kChangeValue:
if (ikey_.type != kTypeValue) {
ikey_.type = kTypeValue;
current_key_.UpdateInternalKey(ikey_.sequence, kTypeValue);
}
value_ = compaction_filter_value_;
- } else if (decision == CompactionFilter::Decision::kRemoveAndSkipUntil) {
- *need_skip = true;
- compaction_filter_skip_until_.ConvertFromUserKey(kMaxSequenceNumber,
- kValueTypeForSeek);
- *skip_until = compaction_filter_skip_until_.Encode();
- } else if (decision == CompactionFilter::Decision::kChangeBlobIndex) {
+ break;
+ case CompactionFilter::Decision::kChangeBlobIndex:
// Only the StackableDB-based BlobDB impl's compaction filter should return
// kChangeBlobIndex. Decision about rewriting blob and changing blob index
// in the integrated BlobDB impl is made in subsequent call to
@@ -411,18 +427,18 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
}
value_ = compaction_filter_value_;
- } else if (decision == CompactionFilter::Decision::kIOError) {
+ break;
+ case CompactionFilter::Decision::kIOError:
if (!compaction_filter_->IsStackedBlobDbInternalCompactionFilter()) {
status_ = Status::NotSupported(
"CompactionFilter for integrated BlobDB should not return kIOError");
- validity_info_.Invalidate();
- return false;
+ } else {
+ status_ = Status::IOError("Failed to access blob during compaction filter");
}
-
- status_ = Status::IOError("Failed to access blob during compaction filter");
validity_info_.Invalidate();
return false;
- } else if (decision == CompactionFilter::Decision::kChangeWideColumnEntity) {
+ case CompactionFilter::Decision::kChangeWideColumnEntity:
+ {
WideColumns sorted_columns;
sorted_columns.reserve(new_columns.size());
@@ -448,7 +464,9 @@ bool CompactionIterator::InvokeFilterIfNeeded(bool* need_skip,
}
value_ = compaction_filter_value_;
- }
+ }
+ break;
+ } // switch
return true;
}
@@ -466,7 +484,7 @@ void CompactionIterator::NextFromInput() {
is_range_del_ = input_.IsDeleteRangeSentinelKey();
Status pik_status = ParseInternalKey(key_, &ikey_, allow_data_in_errors_);
- if (!pik_status.ok()) {
+ if (UNLIKELY(!pik_status.ok())) {
iter_stats_.num_input_corrupt_records++;
// If `expect_valid_internal_key_` is false, return the corrupted key
@@ -483,7 +501,7 @@ void CompactionIterator::NextFromInput() {
break;
}
TEST_SYNC_POINT_CALLBACK("CompactionIterator:ProcessKV", &ikey_);
- if (is_range_del_) {
+ if (UNLIKELY(is_range_del_)) {
validity_info_.SetValid(kRangeDeletion);
break;
}
@@ -796,6 +814,7 @@ void CompactionIterator::NextFromInput() {
// is an unexpected Merge or Delete. We will compact it out
// either way. We will maintain counts of how many mismatches
// happened
+ ROCKSDB_ASSUME(next_ikey.type < kTypeMaxValid);
if (next_ikey.type != kTypeValue &&
next_ikey.type != kTypeBlobIndex &&
next_ikey.type != kTypeWideColumnEntity) {
@@ -1033,8 +1052,10 @@ void CompactionIterator::NextFromInput() {
// trim_ts.
bool should_delete = false;
if (!timestamp_size_ || cmp_with_history_ts_low_ < 0) {
+ if (!range_del_agg_->IsEmpty()) {
should_delete = range_del_agg_->ShouldDelete(
key_, RangeDelPositioningMode::kForwardTraversal);
+ }
}
if (should_delete) {
++iter_stats_.num_record_drop_hidden;
@@ -1248,6 +1269,7 @@ void CompactionIterator::DecideOutputLevel() {
}
}
+ROCKSDB_FLATTEN
void CompactionIterator::PrepareOutput() {
if (Valid()) {
if (LIKELY(!is_range_del_)) {
@@ -1260,7 +1282,7 @@ void CompactionIterator::PrepareOutput() {
// For range del sentinel, we don't use it to cut files for bottommost
// compaction. So it should not make a difference which output level we
// decide.
- if (compaction_ != nullptr && compaction_->SupportsPerKeyPlacement()) {
+ if (compaction_ != nullptr && supports_per_key_placement_) {
DecideOutputLevel();
}
}
@@ -1277,7 +1299,7 @@ void CompactionIterator::PrepareOutput() {
// Can we do the same for levels above bottom level as long as
// KeyNotExistsBeyondOutputLevel() return true?
if (Valid() && compaction_ != nullptr &&
- !compaction_->allow_ingest_behind() && bottommost_level_ &&
+ !allow_ingest_behind_ && bottommost_level_ &&
DefinitelyInSnapshot(ikey_.sequence, earliest_snapshot_) &&
ikey_.type != kTypeMerge && current_key_committed_ &&
!output_to_penultimate_level_ &&
@@ -1322,15 +1344,19 @@ void CompactionIterator::PrepareOutput() {
inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
SequenceNumber in, SequenceNumber* prev_snapshot) {
+ auto const snapshots_beg = snapshots_->begin();
+ auto const snapshots_end = snapshots_->end();
+ auto const snapshots_num = snapshots_end - snapshots_beg;
assert(snapshots_->size());
- if (snapshots_->size() == 0) {
+ if (snapshots_num == 0) {
ROCKS_LOG_FATAL(info_log_,
"No snapshot left in findEarliestVisibleSnapshot");
}
auto snapshots_iter =
- std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+ //std::lower_bound(snapshots_->begin(), snapshots_->end(), in);
+ snapshots_beg + terark::lower_bound_0(snapshots_beg, snapshots_num, in);
assert(prev_snapshot != nullptr);
- if (snapshots_iter == snapshots_->begin()) {
+ if (snapshots_iter == snapshots_beg) {
*prev_snapshot = 0;
} else {
*prev_snapshot = *std::prev(snapshots_iter);
@@ -1343,11 +1369,11 @@ inline SequenceNumber CompactionIterator::findEarliestVisibleSnapshot(
}
}
if (snapshot_checker_ == nullptr) {
- return snapshots_iter != snapshots_->end() ? *snapshots_iter
+ return snapshots_iter != snapshots_end ? *snapshots_iter
: kMaxSequenceNumber;
}
bool has_released_snapshot = !released_snapshots_.empty();
- for (; snapshots_iter != snapshots_->end(); ++snapshots_iter) {
+ for (; snapshots_iter != snapshots_end; ++snapshots_iter) {
auto cur = *snapshots_iter;
if (in > cur) {
ROCKS_LOG_FATAL(info_log_,
@@ -1417,7 +1443,7 @@ std::unique_ptr<BlobFetcher> CompactionIterator::CreateBlobFetcherIfNeeded(
read_options.io_activity = Env::IOActivity::kCompaction;
read_options.fill_cache = false;
- return std::unique_ptr<BlobFetcher>(new BlobFetcher(version, read_options));
+ return std::make_unique<BlobFetcher>(version, read_options);
}
std::unique_ptr<PrefetchBufferCollection>
diff --git a/db/compaction/compaction_iterator.h b/db/compaction/compaction_iterator.h
index 1ff9c88692..dbf315ba1b 100644
--- a/db/compaction/compaction_iterator.h
+++ b/db/compaction/compaction_iterator.h
@@ -369,6 +369,8 @@ class CompactionIterator {
SystemClock* clock_;
const bool report_detailed_time_;
const bool expect_valid_internal_key_;
+ const bool allow_ingest_behind_;
+ const bool supports_per_key_placement_;
CompactionRangeDelAggregator* range_del_agg_;
BlobFileBuilder* blob_file_builder_;
std::unique_ptr<const CompactionProxy> compaction_;
diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc
index 99b099759d..ed3ecc8b48 100644
--- a/db/compaction/compaction_job.cc
+++ b/db/compaction/compaction_job.cc
@@ -8,6 +8,7 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_job.h"
+#include "compaction_executor.h"
#include <algorithm>
#include <cinttypes>
@@ -47,6 +48,8 @@
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"
+#include "rocksdb/merge_operator.h"
+#include "rocksdb/sst_partitioner.h"
#include "rocksdb/statistics.h"
#include "rocksdb/status.h"
#include "rocksdb/table.h"
@@ -195,6 +198,11 @@ CompactionJob::CompactionJob(
ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking);
ThreadStatusUtil::SetColumnFamily(cfd);
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
+ for (auto& level : *compaction->inputs()) {
+ for (auto& file : level.files) {
+ file->job_id = job_id;
+ }
+ }
ReportStartedCompaction(compaction);
}
@@ -356,6 +364,7 @@ uint64_t CompactionJob::GetSubcompactionsLimit() {
void CompactionJob::AcquireSubcompactionResources(
int num_extra_required_subcompactions) {
+#if defined(ROCKSDB_UNIT_TEST)
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:0");
TEST_SYNC_POINT("CompactionJob::AcquireSubcompactionResources:1");
int max_db_compactions =
@@ -392,9 +401,11 @@ void CompactionJob::AcquireSubcompactionResources(
} else {
*bg_compaction_scheduled_ += extra_num_subcompaction_threads_reserved_;
}
+#endif
}
void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
+#if defined(ROCKSDB_UNIT_TEST)
// Do nothing when we have zero resources to shrink
if (num_extra_resources == 0) return;
db_mutex_->Lock();
@@ -419,9 +430,11 @@ void CompactionJob::ShrinkSubcompactionResources(uint64_t num_extra_resources) {
}
db_mutex_->Unlock();
TEST_SYNC_POINT("CompactionJob::ShrinkSubcompactionResources:0");
+#endif
}
void CompactionJob::ReleaseSubcompactionResources() {
+#if defined(ROCKSDB_UNIT_TEST)
if (extra_num_subcompaction_threads_reserved_ == 0) {
return;
}
@@ -440,6 +453,7 @@ void CompactionJob::ReleaseSubcompactionResources() {
1 + extra_num_subcompaction_threads_reserved_);
}
ShrinkSubcompactionResources(extra_num_subcompaction_threads_reserved_);
+#endif
}
struct RangeWithSize {
@@ -478,11 +492,15 @@ void CompactionJob::GenSubcompactionBoundaries() {
// cause relatively small inaccuracy.
const ReadOptions read_options(Env::IOActivity::kCompaction);
auto* c = compact_->compaction;
+#if defined(ROCKSDB_UNIT_TEST)
if (c->max_subcompactions() <= 1 &&
!(c->immutable_options()->compaction_pri == kRoundRobin &&
c->immutable_options()->compaction_style == kCompactionStyleLevel)) {
return;
}
+#else
+ if (c->max_subcompactions() <= 1) return;
+#endif
auto* cfd = c->column_family_data();
const Comparator* cfd_comparator = cfd->user_comparator();
const InternalKeyComparator& icomp = cfd->internal_comparator();
@@ -548,6 +566,7 @@ void CompactionJob::GenSubcompactionBoundaries() {
}),
all_anchors.end());
+#if defined(ROCKSDB_UNIT_TEST)
// Get the number of planned subcompactions, may update reserve threads
// and update extra_num_subcompaction_threads_reserved_ for round-robin
uint64_t num_planned_subcompactions;
@@ -580,6 +599,9 @@ void CompactionJob::GenSubcompactionBoundaries() {
} else {
num_planned_subcompactions = GetSubcompactionsLimit();
}
+#else
+ uint64_t num_planned_subcompactions = std::max(1u, c->max_subcompactions());
+#endif
TEST_SYNC_POINT_CALLBACK("CompactionJob::GenSubcompactionBoundaries:0",
&num_planned_subcompactions);
@@ -620,6 +642,23 @@ void CompactionJob::GenSubcompactionBoundaries() {
}
Status CompactionJob::Run() {
+ auto icf_opt = compact_->compaction->immutable_options();
+ auto exec = icf_opt->compaction_executor_factory.get();
+ if (!exec || exec->ShouldRunLocal(compact_->compaction)) {
+ return RunLocal();
+ }
+ Status s = RunRemote();
+ if (!s.ok()) {
+ if (exec->AllowFallbackToLocal()) {
+ s = RunLocal();
+ } else {
+ // fatal, rocksdb does not handle compact errors properly
+ }
+ }
+ return s;
+}
+
+Status CompactionJob::RunLocal() {
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_RUN);
TEST_SYNC_POINT("CompactionJob::Run():Start");
@@ -646,6 +685,26 @@ Status CompactionJob::Run() {
for (auto& thread : thread_pool) {
thread.join();
}
+ auto GetPath = [this]() {
+ size_t pathId = compact_->compaction->output_path_id();
+ auto& paths = compact_->compaction->immutable_options()->cf_paths;
+ return paths[std::min(paths.size()-1, pathId)].path.c_str();
+ };
+ for (const auto& state : compact_->sub_compact_states) {
+ std::string filelist;
+ long long size = 0;
+ for (const auto& output : state.GetOutputs()) {
+ auto& fd = output.meta.fd;
+ char buf[32];
+ auto len = sprintf(buf, "%06lld,", (long long)fd.GetNumber());
+ filelist.append(buf, len);
+ size += fd.file_size;
+ }
+ if (!filelist.empty()) filelist.pop_back();
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "job-%05d: subcompact[%d], size: %.6f G, files: %s [%s]",
+ job_id_, state.sub_job_id, size/1e9, GetPath(), filelist.c_str());
+ }
compaction_stats_.SetMicros(db_options_.clock->NowMicros() - start_micros);
@@ -654,8 +713,28 @@ Status CompactionJob::Run() {
state.RemoveLastEmptyOutput();
}
- RecordTimeToHistogram(stats_, COMPACTION_TIME,
- compaction_stats_.stats.micros);
+ for (size_t i = 0; i < compact_->sub_compact_states.size(); i++) {
+ auto& sub = compact_->sub_compact_states[i];
+ for (size_t j = 0; j < sub.outputs.size(); ++j) {
+ auto& meta = sub.outputs[j].meta;
+ auto raw = meta.raw_key_size + meta.raw_value_size;
+ auto zip = meta.fd.file_size;
+ RecordTick(stats_, LCOMPACT_WRITE_BYTES_RAW, raw);
+ RecordTimeToHistogram(stats_, LCOMPACTION_OUTPUT_FILE_RAW_SIZE, raw);
+ RecordTimeToHistogram(stats_, LCOMPACTION_OUTPUT_FILE_ZIP_SIZE, zip);
+ }
+ }
+ uint64_t sum_raw = 0, sum_zip = 0;
+ for (auto& each_level : *compact_->compaction->inputs()) {
+ for (FileMetaData* fmd : each_level.files) {
+ sum_raw += fmd->raw_key_size + fmd->raw_value_size;
+ sum_zip += fmd->fd.file_size;
+ }
+ }
+ RecordTimeToHistogram(stats_, LCOMPACTION_INPUT_RAW_BYTES, sum_raw);
+ RecordTimeToHistogram(stats_, LCOMPACTION_INPUT_ZIP_BYTES, sum_zip);
+
+ RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.stats.micros);
RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME,
compaction_stats_.stats.cpu_micros);
@@ -752,6 +831,8 @@ Status CompactionJob::Run() {
OutputValidator validator(cfd->internal_comparator(),
/*_enable_order_check=*/true,
/*_enable_hash=*/true);
+ auto& fd = files_output[file_idx]->meta.fd;
+ validator.m_file_number = fd.GetNumber();
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
s = validator.Add(iter->key(), iter->value());
if (!s.ok()) {
@@ -763,7 +844,13 @@ Status CompactionJob::Run() {
}
if (s.ok() &&
!validator.CompareValidator(files_output[file_idx]->validator)) {
- s = Status::Corruption("Paranoid checksums do not match");
+ #if !defined(ROCKSDB_UNIT_TEST)
+ ROCKSDB_DIE("Compact: Paranoid checksums do not match(%s/%lld.sst)",
+ compact_->compaction->output_path().path.c_str(),
+ (long long)fd.GetNumber());
+ #else
+ s = Status::Corruption("Compact: Paranoid checksums do not match");
+ #endif
}
}
@@ -850,9 +937,269 @@ Status CompactionJob::Run() {
return status;
}
+void CompactionJob::GetSubCompactOutputs(
+ std::vector<std::vector<const FileMetaData*> >* outputs) const {
+ outputs->clear();
+ outputs->reserve(compact_->sub_compact_states.size());
+ for (const auto& state : compact_->sub_compact_states) {
+ outputs->emplace_back();
+ auto& cur_sub = outputs->back();
+ for (const auto& output : state.outputs) {
+ cur_sub.push_back(&output.meta);
+ }
+ }
+}
+
+Status CompactionJob::RunRemote()
+try {
+ ROCKSDB_VERIFY_F(nullptr == snapshot_checker_,
+ "dcompact does not support snapshot_checker, ex: WritePreparedTxnDB "
+ "and WriteUnpreparedTxnDB are not supported because they use "
+ "WritePreparedSnapshotChecker"
+ );
+
+ AutoThreadOperationStageUpdater stage_updater(
+ ThreadStatus::STAGE_COMPACTION_RUN);
+ TEST_SYNC_POINT("CompactionJob::RunRemote():Start");
+ log_buffer_->FlushBufferToLog();
+ LogCompaction();
+
+ size_t num_threads = compact_->sub_compact_states.size();
+ assert(num_threads > 0);
+ const Compaction* c = compact_->compaction;
+ ColumnFamilyData* cfd = c->column_family_data();
+ auto imm_cfo = c->immutable_options();
+ auto mut_cfo = c->mutable_cf_options();
+
+ // if with compaction filter, always use compaction filter factory
+ assert(nullptr == imm_cfo->compaction_filter);
+ CompactionParams rpc_params;
+ CompactionResults rpc_results;
+
+ rpc_results.status = Status::Incomplete("Just Created");
+ rpc_params.job_id = job_id_;
+ rpc_params.version_set.From(versions_);
+ #if (ROCKSDB_MAJOR * 10000 + ROCKSDB_MINOR * 10 + ROCKSDB_PATCH) < 70030
+ rpc_params.preserve_deletes_seqnum = preserve_deletes_seqnum_;
+ #endif
+ rpc_params.existing_snapshots = &existing_snapshots_;
+ rpc_params.earliest_write_conflict_snapshot = earliest_write_conflict_snapshot_;
+ rpc_params.paranoid_file_checks = paranoid_file_checks_;
+ rpc_params.dbname = this->dbname_;
+ rpc_params.db_id = this->db_id_;
+ rpc_params.db_session_id = this->db_session_id_;
+ rpc_params.full_history_ts_low = this->full_history_ts_low_;
+//rpc_params.compaction_job_stats = this->compaction_job_stats_;
+//rpc_params.max_subcompactions = uint32_t(num_threads);
+ rpc_params.max_subcompactions = c->max_subcompactions();
+ rpc_params.shutting_down = this->shutting_down_;
+
+ const uint64_t start_micros = env_->NowMicros();
+ auto exec_factory = imm_cfo->compaction_executor_factory.get();
+ assert(nullptr != exec_factory);
+ auto exec = exec_factory->NewExecutor(c);
+ std::unique_ptr<CompactionExecutor> exec_auto_del(exec);
+ exec->SetParams(&rpc_params, c);
+ Status s = exec->Execute(rpc_params, &rpc_results);
+ if (!s.ok()) {
+ compact_->status = s;
+ return s;
+ }
+ if (!rpc_results.status.ok()) {
+ compact_->status = rpc_results.status;
+ return rpc_results.status;
+ }
+ //exec->NotifyResults(&rpc_results, c);
+
+ // the remote compaction fabricates a version_set, which may cause
+ // GenSubcompactionBoundaries to yield a different number of sub_compact_states,
+ // thus making the following assert fail:
+ //assert(rpc_results.output_files.size() == num_threads); // can be diff
+
+ const uint64_t elapsed_us = env_->NowMicros() - start_micros;
+ compaction_stats_.stats = rpc_results.compaction_stats;
+ *compaction_job_stats_ = rpc_results.job_stats;
+
+ // remote statistics will be merged into stats_ later: stats_->Merge(..)
+ //RecordTimeToHistogram(stats_, COMPACTION_TIME, compaction_stats_.micros);
+ //RecordTimeToHistogram(stats_, COMPACTION_CPU_TIME, compaction_stats_.cpu_micros);
+
+ TablePropertiesCollection tp_map;
+ auto& cf_paths = imm_cfo->cf_paths;
+ compact_->num_output_files = 0;
+
+ if (rpc_results.output_files.size() != num_threads) {
+ size_t result_sub_num = rpc_results.output_files.size();
+ // this can happen, but it is rare; log it
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "job-%05d: subcompact num diff: rpc = %zd, local = %zd",
+ job_id_, result_sub_num, num_threads);
+ num_threads = result_sub_num;
+ auto& sub_vec = compact_->sub_compact_states;
+ while (sub_vec.size() < result_sub_num) {
+ int sub_job_id = (int)sub_vec.size();
+ sub_vec.emplace_back(compact_->compaction, nullptr, nullptr, sub_job_id);
+ }
+ while (sub_vec.size() > result_sub_num) {
+ sub_vec.pop_back();
+ }
+ }
+
+ long long rename_t0 = env_->NowMicros();
+ size_t out_raw_bytes = 0;
+ uint64_t epoch_number = c->MinInputFileEpochNumber();
+ for (size_t i = 0; i < num_threads; ++i) {
+ auto& sub_state = compact_->sub_compact_states[i];
+ for (const auto& min_meta : rpc_results.output_files[i]) {
+ auto old_fnum = min_meta.file_number;
+ auto old_fname = MakeTableFileName(rpc_results.output_dir, old_fnum);
+ auto path_id = c->output_path_id();
+ uint64_t file_number = versions_->NewFileNumber();
+ std::string new_fname = TableFileName(cf_paths, file_number, path_id);
+ Status st = exec->RenameFile(old_fname, new_fname, min_meta.file_size);
+ if (!st.ok()) {
+ ROCKS_LOG_ERROR(db_options_.info_log, "rename(%s, %s) = %s",
+ old_fname.c_str(), new_fname.c_str(), st.ToString().c_str());
+ compact_->status = st;
+ return st;
+ }
+ FileDescriptor fd(file_number, path_id, min_meta.file_size,
+ min_meta.smallest_seqno, min_meta.largest_seqno);
+ FileMetaData meta;
+ meta.fd = fd;
+ TableCache* tc = cfd->table_cache();
+ TableCache::TypedHandle* ch = nullptr;
+ auto& icmp = cfd->internal_comparator();
+ auto& fopt = *cfd->soptions(); // file_options
+ auto& pref_ext = mut_cfo->prefix_extractor;
+ st = tc->FindTable(ReadOptions(), fopt, icmp, meta, &ch,
+ mut_cfo->block_protection_bytes_per_key, pref_ext);
+ if (!st.ok()) {
+ compact_->status = st;
+ return st;
+ }
+ assert(nullptr != ch);
+ TableReader* tr = tc->GetTableReaderFromHandle(ch);
+ auto tp = tr->GetTableProperties();
+ tp_map[new_fname] = tp;
+ out_raw_bytes += tp->raw_key_size + tp->raw_value_size;
+ tc->ReleaseHandle(ch); // end use of TableReader in handle
+ meta.smallest = min_meta.smallest_ikey;
+ meta.largest = min_meta.largest_ikey;
+ meta.num_deletions = tp->num_deletions;
+ meta.num_entries = tp->num_entries;
+ meta.raw_key_size = tp->raw_key_size;
+ meta.raw_value_size = tp->raw_value_size;
+ meta.marked_for_compaction = min_meta.marked_for_compaction;
+ meta.epoch_number = epoch_number;
+ bool enable_order_check = mut_cfo->check_flush_compaction_key_order;
+ bool enable_hash = paranoid_file_checks_;
+ uint64_t precalculated_hash = 0;
+ sub_state.outputs.emplace_back(std::move(meta), icmp,
+ enable_order_check, enable_hash, true, precalculated_hash);
+ sub_state.total_bytes += min_meta.file_size;
+ sub_state.num_output_records += tp->num_entries;
+ rpc_results.output_index_size += tp->index_size;
+ rpc_results.output_data_size += tp->data_size;
+ }
+ // instead of AggregateStatistics:
+ compact_->num_output_files += sub_state.outputs.size();
+ compact_->total_bytes += sub_state.total_bytes;
+ compact_->num_output_records += sub_state.num_output_records;
+ }
+ compact_->compaction->SetOutputTableProperties(std::move(tp_map));
+ long long rename_t1 = env_->NowMicros();
+
+ {
+ Compaction::InputLevelSummaryBuffer inputs_summary; // NOLINT
+ double work_time_us = rpc_results.work_time_usec;
+ if (work_time_us <= 1) work_time_us = 1;
+ ROCKS_LOG_INFO(db_options_.info_log,
+ "[%s] [JOB %d] Dcompacted %s [%zd] => time sec: "
+ "curl = %6.3f, mount = %6.3f, prepare = %6.3f, "
+ "wait = %6.3f, work = %6.3f, e2e = %6.3f, rename = %6.3f, "
+ "out zip = %9.6f GB %8.3f MB/sec, "
+ "out raw = %9.6f GB %8.3f MB/sec",
+ c->column_family_data()->GetName().c_str(), job_id_,
+ c->InputLevelSummary(&inputs_summary), compact_->num_output_files,
+ rpc_results.curl_time_usec/1e6,
+ rpc_results.mount_time_usec/1e6,
+ rpc_results.prepare_time_usec/1e6,
+ (elapsed_us - work_time_us)/1e6, // wait is non-work
+ work_time_us/1e6, elapsed_us/1e6, (rename_t1 - rename_t0)/1e6,
+ compact_->total_bytes/1e9, compact_->total_bytes/work_time_us,
+ out_raw_bytes/1e9, out_raw_bytes/work_time_us);
+ }
+ // Finish up all book-keeping to unify the subcompaction results
+ // these were run on remote compaction worker node
+ //AggregateStatistics();
+ //UpdateCompactionStats();
+ //compaction_job_stats_->Add(rpc_results.job_stats); // instead of AggregateStatistics
+
+ //RecordCompactionIOStats(); // update remote statistics to local -->>
+#if defined(__GNUC__) && !defined(__clang__)
+ #pragma GCC diagnostic push
+ #pragma GCC diagnostic ignored "-Wclass-memaccess"
+#endif
+#define MoveHG(dst,src) \
+ memcpy(&rpc_results.statistics.histograms[dst], \
+ &rpc_results.statistics.histograms[src], \
+ sizeof rpc_results.statistics.histograms[src]), \
+ rpc_results.statistics.histograms[src].Clear()
+ MoveHG(DCOMPACTION_INPUT_RAW_BYTES, LCOMPACTION_INPUT_RAW_BYTES);
+ MoveHG(DCOMPACTION_INPUT_ZIP_BYTES, LCOMPACTION_INPUT_ZIP_BYTES);
+ MoveHG(DCOMPACTION_OUTPUT_FILE_RAW_SIZE, LCOMPACTION_OUTPUT_FILE_RAW_SIZE);
+ MoveHG(DCOMPACTION_OUTPUT_FILE_ZIP_SIZE, LCOMPACTION_OUTPUT_FILE_ZIP_SIZE);
+#if defined(__GNUC__) && !defined(__clang__)
+ #pragma GCC diagnostic pop
+#endif
+
+#define MoveTK(dst, src) \
+ rpc_results.statistics.tickers[dst] = rpc_results.statistics.tickers[src]; \
+ rpc_results.statistics.tickers[src] = 0
+
+ MoveTK(DCOMPACT_WRITE_BYTES_RAW, LCOMPACT_WRITE_BYTES_RAW);
+ MoveTK(REMOTE_COMPACT_READ_BYTES, COMPACT_READ_BYTES);
+ MoveTK(REMOTE_COMPACT_WRITE_BYTES, COMPACT_WRITE_BYTES);
+
+ stats_->Merge(rpc_results.statistics.tickers,
+ rpc_results.statistics.histograms);
+
+ LogFlush(db_options_.info_log);
+ TEST_SYNC_POINT("CompactionJob::RunRemote():End");
+
+ exec->CleanFiles(rpc_params, rpc_results);
+
+ compact_->status = Status::OK();
+ return Status::OK();
+}
+catch (const std::exception& ex) {
+ compact_->status = Status::Corruption(ROCKSDB_FUNC, ex.what());
+ return compact_->status;
+}
+catch (const Status& s) {
+ compact_->status = s;
+ return s;
+}
+
Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options,
bool* compaction_released) {
assert(compact_);
+#if 0
+ // this fails unit test:
+ // DBCompactionTestBlobError/DBCompactionTestBlobError.CompactionError/1
+ // and does not help for error checking
+ if (!compact_->status.ok()) { // caller does not check retval of Run()
+ ColumnFamilyData* cfd = compact_->compaction->column_family_data();
+ assert(cfd);
+ ROCKS_LOG_BUFFER(log_buffer_, "[%s] compaction failed, job_id = %d : %s",
+ cfd->GetName().c_str(), job_id_,
+ compact_->status.ToString().c_str());
+ Status s = compact_->status;
+ CleanupCompaction();
+ return s;
+ }
+#endif
AutoThreadOperationStageUpdater stage_updater(
ThreadStatus::STAGE_COMPACTION_INSTALL);
@@ -952,6 +1299,7 @@ Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options,
UpdateCompactionJobStats(stats);
auto stream = event_logger_->LogToBuffer(log_buffer_, 8192);
+ stream << "cf" << cfd->GetName();
stream << "job" << job_id_ << "event"
<< "compaction_finished"
<< "compaction_time_micros" << stats.micros
@@ -1196,7 +1544,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
std::unique_ptr<InternalIterator> clip;
if (start.has_value() || end.has_value()) {
- clip = std::make_unique<ClippingIterator>(
+ clip = MakeClippingIterator(
raw_input.get(), start.has_value() ? &start_slice : nullptr,
end.has_value() ? &end_slice : nullptr, &cfd->internal_comparator());
input = clip.get();
@@ -1621,6 +1969,10 @@ Status CompactionJob::FinishCompactionOutputFile(
TableProperties tp;
if (s.ok()) {
tp = outputs.GetTableProperties();
+ meta->num_entries = tp.num_entries;
+ meta->num_deletions = tp.num_deletions;
+ meta->raw_key_size = tp.raw_key_size;
+ meta->raw_value_size = tp.raw_value_size;
}
if (s.ok() && current_entries == 0 && tp.num_range_deletions == 0) {
@@ -1773,6 +2125,7 @@ Status CompactionJob::InstallCompactionResults(
stats.GetBytes());
}
+#if defined(ROCKSDB_UNIT_TEST)
if ((compaction->compaction_reason() ==
CompactionReason::kLevelMaxLevelSize ||
compaction->compaction_reason() == CompactionReason::kRoundRobinTtl) &&
@@ -1785,6 +2138,7 @@ Status CompactionJob::InstallCompactionResults(
start_level, compaction->num_input_files(0)));
}
}
+#endif
auto manifest_wcb = [&compaction, &compaction_released](const Status& s) {
compaction->ReleaseCompactionFiles(s);
@@ -2077,15 +2431,17 @@ void CompactionJob::LogCompaction() {
if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
Compaction::InputLevelSummaryBuffer inputs_summary;
ROCKS_LOG_INFO(
- db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
+ db_options_.info_log,
+ "[%s] [JOB %d] Compacting %s, score %.2f, subcompactions %d : %zd",
cfd->GetName().c_str(), job_id_,
- compaction->InputLevelSummary(&inputs_summary), compaction->score());
+ compaction->InputLevelSummary(&inputs_summary), compaction->score(),
+ compaction->max_subcompactions(), compact_->sub_compact_states.size());
char scratch[2345];
compaction->Summary(scratch, sizeof(scratch));
ROCKS_LOG_INFO(db_options_.info_log, "[%s]: Compaction start summary: %s\n",
cfd->GetName().c_str(), scratch);
// build event logger report
- auto stream = event_logger_->Log();
+ auto stream = event_logger_->LogToBuffer(log_buffer_, 64*1024);
stream << "job" << job_id_ << "event"
<< "compaction_started"
<< "compaction_reason"
diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h
index e812cfc72a..acaf657525 100644
--- a/db/compaction/compaction_job.h
+++ b/db/compaction/compaction_job.h
@@ -194,6 +194,10 @@ class CompactionJob {
// Return the IO status
IOStatus io_status() const { return io_status_; }
+ void GetSubCompactOutputs(std::vector<std::vector<const FileMetaData*> >*) const;
+ CompactionJobStats* GetCompactionJobStats() const { return compaction_job_stats_; }
+ const InternalStats::CompactionStatsFull& GetCompactionStats() const { return compaction_stats_; }
+
protected:
// Update the following stats in compaction_stats_.stats
// - num_input_files_in_non_output_levels
@@ -289,6 +293,9 @@ class CompactionJob {
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);
+ Status RunLocal();
+ Status RunRemote();
+
uint32_t job_id_;
// DBImpl state
@@ -362,6 +369,8 @@ class CompactionJob {
// the last level (output to penultimate level).
SequenceNumber preclude_last_level_min_seqno_ = kMaxSequenceNumber;
+ std::vector > rand_key_store_;
+
// Get table file name in where it's outputting to, which should also be in
// `output_directory_`.
virtual std::string GetTableFileName(uint64_t file_number);
diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc
index eb76cd849a..62949191a4 100644
--- a/db/compaction/compaction_outputs.cc
+++ b/db/compaction/compaction_outputs.cc
@@ -121,20 +121,29 @@ bool CompactionOutputs::UpdateFilesToCutForTTLStates(
return false;
}
-size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
- const Slice& internal_key) {
- size_t curr_key_boundary_switched_num = 0;
- const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
-
- if (grandparents.empty()) {
- return curr_key_boundary_switched_num;
+ROCKSDB_FLATTEN
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(const Slice& ikey) {
+ if (0 == grandparents_size_) {
+ return 0;
}
- const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
+ if (cmp_meta_.IsForwardBytewise())
+ return UpdateGrandparentBoundaryInfoTmpl(ForwardBytewiseCompareUserKeyNoTS(), ikey);
+ if (cmp_meta_.IsReverseBytewise())
+ return UpdateGrandparentBoundaryInfoTmpl(ReverseBytewiseCompareUserKeyNoTS(), ikey);
+ else
+ return UpdateGrandparentBoundaryInfoTmpl(VirtualFunctionCompareUserKeyNoTS
+ {compaction_->immutable_options()->user_comparator}, ikey);
+}
+template<class UKCmpNoTS>
+size_t CompactionOutputs::UpdateGrandparentBoundaryInfoTmpl(UKCmpNoTS ucmp, const Slice& ikey) {
+ size_t curr_key_boundary_switched_num = 0;
+ const auto grandparents = grandparents_data_;
+ const auto grandparents_size = grandparents_size_;
// Move the grandparent_index_ to the file containing the current user_key.
// If there are multiple files containing the same user_key, make sure the
// index points to the last file containing the key.
- while (grandparent_index_ < grandparents.size()) {
+ while (grandparent_index_ < grandparents_size) {
if (being_grandparent_gap_) {
if (sstableKeyCompare(ucmp, internal_key,
grandparents[grandparent_index_]->smallest) < 0) {
@@ -154,8 +163,8 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
// one.
if (cmp_result < 0 ||
(cmp_result == 0 &&
- (grandparent_index_ == grandparents.size() - 1 ||
- sstableKeyCompare(ucmp, internal_key,
+ (grandparent_index_ == grandparents_size - 1 ||
+ sstableKeyCompare(ucmp, ikey,
grandparents[grandparent_index_ + 1]->smallest) <
0))) {
break;
@@ -174,7 +183,7 @@ size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
if (!seen_key_ && !being_grandparent_gap_) {
assert(grandparent_overlapped_bytes_ == 0);
grandparent_overlapped_bytes_ =
- GetCurrentKeyGrandparentOverlappedBytes(internal_key);
+ GetCurrentKeyGrandparentOverlappedBytes(ikey);
}
seen_key_ = true;
@@ -189,7 +198,7 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
}
uint64_t overlapped_bytes = 0;
- const std::vector<FileMetaData*>& grandparents = compaction_->grandparents();
+ const auto grandparents = grandparents_data_;
const Comparator* ucmp = compaction_->column_family_data()->user_comparator();
InternalKey ikey;
ikey.DecodeFrom(internal_key);
@@ -201,7 +210,7 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
assert(
cmp_result < 0 ||
(cmp_result == 0 &&
- (grandparent_index_ == grandparents.size() - 1 ||
+ (grandparent_index_ == grandparents_size_ - 1 ||
sstableKeyCompare(
ucmp, ikey, grandparents[grandparent_index_ + 1]->smallest) < 0)));
assert(sstableKeyCompare(ucmp, ikey,
@@ -236,15 +245,13 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
}
#endif // NDEBUG
const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
- const InternalKeyComparator* icmp =
- &compaction_->column_family_data()->internal_comparator();
size_t num_grandparent_boundaries_crossed = 0;
bool should_stop_for_ttl = false;
// Always update grandparent information like overlapped file number, size
// etc., and TTL states.
// If compaction_->output_level() == 0, there is no need to update grandparent
// info, and that `grandparent` should be empty.
- if (compaction_->output_level() > 0) {
+ if (output_level_ > 0) {
num_grandparent_boundaries_crossed =
UpdateGrandparentBoundaryInfo(internal_key);
should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
@@ -260,24 +267,25 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
// If there's user defined partitioner, check that first
if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
- last_key_for_partitioner_, c_iter.user_key(),
+ SliceOf(last_key_for_partitioner_), c_iter.user_key(),
current_output_file_size_)) == kRequired) {
return true;
}
// files output to Level 0 won't be split
- if (compaction_->output_level() == 0) {
+ if (output_level_ == 0) {
return false;
}
// reach the max file size
- if (current_output_file_size_ >= compaction_->max_output_file_size()) {
+ if (current_output_file_size_ >= max_output_file_size_) {
return true;
}
// Check if it needs to split for RoundRobin
// Invalid local_output_split_key indicates that we do not need to split
if (local_output_split_key_ != nullptr && !is_split_) {
+ auto icmp = &compaction_->immutable_options()->internal_comparator;
// Split occurs when the next key is larger than/equal to the cursor
if (icmp->Compare(internal_key, local_output_split_key_->Encode()) >= 0) {
is_split_ = true;
@@ -293,7 +301,7 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
// max_compaction_bytes. Which is to prevent future bigger than
// max_compaction_bytes compaction from the current output level.
if (grandparent_overlapped_bytes_ + current_output_file_size_ >
- compaction_->max_compaction_bytes()) {
+ max_compaction_bytes_) {
return true;
}
@@ -315,13 +323,12 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
// More details, check PR #1963
const size_t num_skippable_boundaries_crossed =
being_grandparent_gap_ ? 2 : 3;
- if (compaction_->immutable_options()->compaction_style ==
- kCompactionStyleLevel &&
- compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ if (compaction_style_ == kCompactionStyleLevel &&
+ level_compaction_dynamic_file_size_ &&
num_grandparent_boundaries_crossed >=
num_skippable_boundaries_crossed &&
grandparent_overlapped_bytes_ - previous_overlapped_bytes >
- compaction_->target_output_file_size() / 8) {
+ target_output_file_size_ / 8) {
return true;
}
@@ -337,11 +344,10 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
// target file size. The test shows it can generate larger files than a
// static threshold like 75% and has a similar write amplification
// improvement.
- if (compaction_->immutable_options()->compaction_style ==
- kCompactionStyleLevel &&
- compaction_->immutable_options()->level_compaction_dynamic_file_size &&
+ if (compaction_style_ == kCompactionStyleLevel &&
+ level_compaction_dynamic_file_size_ &&
current_output_file_size_ >=
- ((compaction_->target_output_file_size() + 99) / 100) *
+ ((target_output_file_size_ + 99) / 100) *
(50 + std::min(grandparent_boundary_switched_num_ * 5,
size_t{40}))) {
return true;
@@ -355,17 +361,16 @@ Status CompactionOutputs::AddToOutput(
const CompactionIterator& c_iter,
const CompactionFileOpenFunc& open_file_func,
const CompactionFileCloseFunc& close_file_func) {
- Status s;
bool is_range_del = c_iter.IsDeleteRangeSentinelKey();
if (is_range_del && compaction_->bottommost_level()) {
// We don't consider range tombstone for bottommost level since:
// 1. there is no grandparent and hence no overlap to consider
// 2. range tombstone may be dropped at bottommost level.
- return s;
+ return Status::OK();
}
const Slice& key = c_iter.key();
if (ShouldStopBefore(c_iter) && HasBuilder()) {
- s = close_file_func(*this, c_iter.InputStatus(), key);
+ Status s = close_file_func(*this, c_iter.InputStatus(), key);
if (!s.ok()) {
return s;
}
@@ -384,7 +389,7 @@ Status CompactionOutputs::AddToOutput(
// Open output file if necessary
if (!HasBuilder()) {
- s = open_file_func(*this);
+ Status s = open_file_func(*this);
if (!s.ok()) {
return s;
}
@@ -398,13 +403,12 @@ Status CompactionOutputs::AddToOutput(
}
if (UNLIKELY(is_range_del)) {
- return s;
+ return Status::OK();
}
assert(builder_ != nullptr);
const Slice& value = c_iter.value();
- s = current_output().validator.Add(key, value);
- if (!s.ok()) {
+ if (Status s = current_output().validator.Add(key, value); !s.ok()) {
return s;
}
builder_->Add(key, value);
@@ -413,15 +417,14 @@ Status CompactionOutputs::AddToOutput(
current_output_file_size_ = builder_->EstimatedFileSize();
if (blob_garbage_meter_) {
- s = blob_garbage_meter_->ProcessOutFlow(key, value);
- }
-
- if (!s.ok()) {
- return s;
+ Status s = blob_garbage_meter_->ProcessOutFlow(key, value);
+ if (!s.ok()) {
+ return s;
+ }
}
const ParsedInternalKey& ikey = c_iter.ikey();
- s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
+ Status s = current_output().meta.UpdateBoundaries(key, value, ikey.sequence,
ikey.type);
return s;
@@ -780,6 +783,17 @@ void CompactionOutputs::FillFilesToCutForTtl() {
CompactionOutputs::CompactionOutputs(const Compaction* compaction,
const bool is_penultimate_level)
: compaction_(compaction), is_penultimate_level_(is_penultimate_level) {
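+ // Cache frequently accessed compaction properties in plain data members so
+ // the per-key paths (ShouldStopBefore, UpdateGrandparentBoundaryInfo, ...)
+ // do not have to chase pointers through compaction_ for every key.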
+ auto& io = *compaction->immutable_options();
+ cmp_meta_ = *io.user_comparator;
+ compaction_style_ = io.compaction_style;
+ level_compaction_dynamic_file_size_ = io.level_compaction_dynamic_file_size;
+ output_level_ = compaction->output_level();
+ max_compaction_bytes_ = compaction->max_compaction_bytes();
+ max_output_file_size_ = compaction->max_output_file_size();
+ target_output_file_size_ = compaction->target_output_file_size();
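+ // The compaction owns the grandparents vector, so the cached raw
+ // pointer/size pair below stays valid for the lifetime of this object.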
+ grandparents_data_ = compaction->grandparents().data();
+ grandparents_size_ = compaction->grandparents().size();
+
partitioner_ = compaction->output_level() == 0
? nullptr
: compaction->CreateSstPartitioner();
diff --git a/db/compaction/compaction_outputs.h b/db/compaction/compaction_outputs.h
index 18246cf2fa..560ef95cdd 100644
--- a/db/compaction/compaction_outputs.h
+++ b/db/compaction/compaction_outputs.h
@@ -15,6 +15,7 @@
#include "db/compaction/compaction_iterator.h"
#include "db/internal_stats.h"
#include "db/output_validator.h"
+#include
namespace ROCKSDB_NAMESPACE {
@@ -190,6 +191,8 @@ class CompactionOutputs {
return range_del_agg_ && !range_del_agg_->IsEmpty();
}
+ std::vector