From 1e722dd8c4402be1c90dc41db874099af603b39d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kasper=20Isager=20Dalsgar=C3=B0?= Date: Thu, 9 Nov 2023 10:41:38 -0800 Subject: [PATCH 01/18] Ensure `target_include_directories()` is called with correct target name (#12055) Summary: `${PROJECT_NAME}` isn't guaranteed to match a target name when an artefact suffix is specified. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12055 Reviewed By: anand1976 Differential Revision: D51125532 Pulled By: ajkr fbshipit-source-id: cd1f4a5b11eb517c379e3ee3f78592f7e606a034 --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ca232b7db..8b759d35f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1171,11 +1171,15 @@ set(BUILD_VERSION_CC ${CMAKE_BINARY_DIR}/build_version.cc) configure_file(util/build_version.cc.in ${BUILD_VERSION_CC} @ONLY) add_library(${ROCKSDB_STATIC_LIB} STATIC ${SOURCES} ${BUILD_VERSION_CC}) +target_include_directories(${ROCKSDB_STATIC_LIB} PUBLIC + $) target_link_libraries(${ROCKSDB_STATIC_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) if(ROCKSDB_BUILD_SHARED) add_library(${ROCKSDB_SHARED_LIB} SHARED ${SOURCES} ${BUILD_VERSION_CC}) + target_include_directories(${ROCKSDB_SHARED_LIB} PUBLIC + $) target_link_libraries(${ROCKSDB_SHARED_LIB} PRIVATE ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) From c0c7d20dea0ef4c1f0d6a5e5ef94d3832e8b6ce4 Mon Sep 17 00:00:00 2001 From: akankshamahajan Date: Thu, 27 Jul 2023 12:02:03 -0700 Subject: [PATCH 02/18] Fix use_after_free bug when underlying FS enables kFSBuffer (#11645) Summary: Fix use_after_free bug in async_io MultiReads when underlying FS enabled kFSBuffer. kFSBuffer is when underlying FS pass their own buffer instead of using RocksDB scratch in FSReadRequest Since it's an experimental feature, added a hack for now to fix the bug. Planning to make public API change to remove const from the callback as it doesn't make sense to use const. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11645 Test Plan: tested locally Reviewed By: ltamasi Differential Revision: D47819907 Pulled By: akankshamahajan15 fbshipit-source-id: 1faf5ef795bf27e2b3a60960374d91274931df8d --- unreleased_history/bug_fixes/fsbuffer_bug_fix.md | 1 + util/async_file_reader.cc | 5 +++++ 2 files changed, 6 insertions(+) create mode 100644 unreleased_history/bug_fixes/fsbuffer_bug_fix.md diff --git a/unreleased_history/bug_fixes/fsbuffer_bug_fix.md b/unreleased_history/bug_fixes/fsbuffer_bug_fix.md new file mode 100644 index 000000000..bec91bc4f --- /dev/null +++ b/unreleased_history/bug_fixes/fsbuffer_bug_fix.md @@ -0,0 +1 @@ +Fix use_after_free bug in async_io MultiReads when underlying FS enabled kFSBuffer. kFSBuffer is when underlying FS pass their own buffer instead of using RocksDB scratch in FSReadRequest. Right now it's an experimental feature. diff --git a/util/async_file_reader.cc b/util/async_file_reader.cc index 080c1ae96..9ce13b99f 100644 --- a/util/async_file_reader.cc +++ b/util/async_file_reader.cc @@ -26,6 +26,11 @@ bool AsyncFileReader::MultiReadAsyncImpl(ReadAwaiter* awaiter) { FSReadRequest* read_req = static_cast(cb_arg); read_req->status = req.status; read_req->result = req.result; + if (req.fs_scratch != nullptr) { + // TODO akanksha: Revisit to remove the const in the callback. 
+ FSReadRequest& req_tmp = const_cast(req); + read_req->fs_scratch = std::move(req_tmp.fs_scratch); + } }, &awaiter->read_reqs_[i], &awaiter->io_handle_[i], &awaiter->del_fn_[i], /*aligned_buf=*/nullptr); From bd22a56d07af150badda34f26bb5391ed45d6fbe Mon Sep 17 00:00:00 2001 From: Andrew Kryczka Date: Sun, 6 Aug 2023 18:01:08 -0700 Subject: [PATCH 03/18] Avoid an std::map copy in persistent stats (#11681) Summary: An internal user reported this copy showing up in a CPU profile. We can use move instead. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11681 Differential Revision: D48103170 Pulled By: ajkr fbshipit-source-id: 083d6470181a0041bb5275b657aa61bee23a3729 --- db/db_impl/db_impl.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 0b6d59207..5c260806a 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -1073,7 +1073,7 @@ void DBImpl::PersistStats() { "Storing %" ROCKSDB_PRIszt " stats with timestamp %" PRIu64 " to in-memory stats history", stats_slice_.size(), now_seconds); - stats_history_[now_seconds] = stats_delta; + stats_history_[now_seconds] = std::move(stats_delta); } stats_slice_initialized_ = true; std::swap(stats_slice_, stats_map); From e9376b4795965616ca768862dc1d5e88c5ab8135 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Fri, 1 Sep 2023 09:34:08 -0700 Subject: [PATCH 04/18] Fix a bug where iterator status is not checked (#11782) Summary: This happens in (Compaction)MergingIterator layer, and can cause data loss during compaction or read/scan return incorrect result Pull Request resolved: https://github.com/facebook/rocksdb/pull/11782 Reviewed By: ajkr Differential Revision: D48880575 Pulled By: cbi42 fbshipit-source-id: 2294ad284a6d653d3674bebe55380f12ee4b645b --- table/compaction_merging_iterator.cc | 1 + table/merging_iterator.cc | 8 ++++++++ .../bug_fixes/001_check_iter_status_data_loss.md | 1 + 3 files changed, 10 insertions(+) create mode 100644 unreleased_history/bug_fixes/001_check_iter_status_data_loss.md diff --git a/table/compaction_merging_iterator.cc b/table/compaction_merging_iterator.cc index 2b41b2a93..d704a3222 100644 --- a/table/compaction_merging_iterator.cc +++ b/table/compaction_merging_iterator.cc @@ -338,6 +338,7 @@ CompactionMergingIteratorF(void)FindNextVisibleKey() { assert(current->iter.status().ok()); minHeap_.replace_top(current); } else { + considerStatus(current->iter.status()); minHeap_.pop(); } if (range_tombstone_iters_[current->level]) { diff --git a/table/merging_iterator.cc b/table/merging_iterator.cc index e0d15d8da..bfe8e3359 100644 --- a/table/merging_iterator.cc +++ b/table/merging_iterator.cc @@ -1239,6 +1239,7 @@ MergingIterMethod(bool)SkipNextDeleted() { InsertRangeTombstoneToMinHeap(current->level, true /* start_key */, true /* replace_top */); } else { + // TruncatedRangeDelIterator does not have status minHeap_.pop(); } return true /* current key deleted */; @@ -1296,6 +1297,9 @@ MergingIterMethod(bool)SkipNextDeleted() { assert(current->iter.status().ok()); UpdatePrefixCache(current); minHeap_.push(current); + } else { + // TODO(cbi): check status and early return if non-ok. 
+ considerStatus(current->iter.status()); } // Invariants (rti) and (phi) if (range_tombstone_iters_[current->level] && @@ -1334,6 +1338,7 @@ MergingIterMethod(bool)SkipNextDeleted() { UpdatePrefixCache(current); minHeap_.replace_top(current); } else { + considerStatus(current->iter.status()); minHeap_.pop(); } return true /* current key deleted */; @@ -1507,6 +1512,8 @@ MergingIterMethod(bool)SkipPrevDeleted() { assert(current->iter.status().ok()); UpdatePrefixCache(current); maxHeap_->push(current); + } else { + considerStatus(current->iter.status()); } if (range_tombstone_iters_[current->level] && @@ -1549,6 +1556,7 @@ MergingIterMethod(bool)SkipPrevDeleted() { UpdatePrefixCache(current); maxHeap_->replace_top(current); } else { + considerStatus(current->iter.status()); maxHeap_->pop(); } return true /* current key deleted */; diff --git a/unreleased_history/bug_fixes/001_check_iter_status_data_loss.md b/unreleased_history/bug_fixes/001_check_iter_status_data_loss.md new file mode 100644 index 000000000..1cedc7215 --- /dev/null +++ b/unreleased_history/bug_fixes/001_check_iter_status_data_loss.md @@ -0,0 +1 @@ +* Fix a bug where if there is an error reading from offset 0 of a file from L1+ and that the file is not the first file in the sorted run, data can be lost in compaction and read/scan can return incorrect results. \ No newline at end of file From 997defd2a169a4f61d0af11baa17e84bc7ca801f Mon Sep 17 00:00:00 2001 From: jsteemann Date: Tue, 29 Aug 2023 18:40:13 -0700 Subject: [PATCH 05/18] remove a sub-condition that is always true (#11746) Summary: the value of `done` is always false here, so the sub-condition `!done` will always be true and the check can be removed. Pull Request resolved: https://github.com/facebook/rocksdb/pull/11746 Reviewed By: anand1976 Differential Revision: D48656845 Pulled By: ajkr fbshipit-source-id: 523ba3d07b3af7880c8c8ccb20442fd7c0f49417 --- db/memtable_list.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 58285cc6f..4cf8d2d47 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -184,7 +184,7 @@ bool MemTableListVersion::GetFromList( assert(*seq != kMaxSequenceNumber || s->IsNotFound()); return true; } - if (!done && !s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { + if (!s->ok() && !s->IsMergeInProgress() && !s->IsNotFound()) { return false; } } From 765b5f6e4e1f968418c1623cfbee1563a565cdcd Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Thu, 31 Aug 2023 08:39:09 -0700 Subject: [PATCH 06/18] Log host name (#11776) Summary: ... in info_log. Becoming more important with disaggregated storage. 
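For reference, with this change the DB SUMMARY block of the info log should begin roughly as follows (log line prefixes omitted; values are placeholders):

    DB SUMMARY

    Host name (Env): <hostname reported by Env::GetHostNameString>
    DB Session ID: <session id>

If Env::GetHostNameString() fails, the host name line is simply skipped, as the guard in the diff below shows.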
Pull Request resolved: https://github.com/facebook/rocksdb/pull/11776 Test Plan: manual Reviewed By: jaykorean Differential Revision: D48849471 Pulled By: pdillinger fbshipit-source-id: 9a8fd8b2564a4f133526ecd7c1414cb667e4ba54 --- db/db_info_dumper.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/db/db_info_dumper.cc b/db/db_info_dumper.cc index be8d5bee1..7dd647955 100644 --- a/db/db_info_dumper.cc +++ b/db/db_info_dumper.cc @@ -34,6 +34,12 @@ void DumpDBFileSummary(const ImmutableDBOptions& options, std::string file_info, wal_info; Header(options.info_log, "DB SUMMARY\n"); + { + std::string hostname; + if (env->GetHostNameString(&hostname).ok()) { + Header(options.info_log, "Host name (Env): %s\n", hostname.c_str()); + } + } Header(options.info_log, "DB Session ID: %s\n", session_id.c_str()); Status s; From 4e4e28aed6e8c5b1fb805d920284151311944a39 Mon Sep 17 00:00:00 2001 From: darionyaphet Date: Mon, 9 Oct 2023 19:10:06 -0700 Subject: [PATCH 07/18] fix typo snapshto (#11817) Summary: Pull Request resolved: https://github.com/facebook/rocksdb/pull/11817 Reviewed By: jaykorean Differential Revision: D50103497 Pulled By: ltamasi fbshipit-source-id: 77c5cf86ff7eb5021fc91b03225882536163af7b --- db/db_impl/db_impl_compaction_flush.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 4c7f09d5d..eba77844d 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -197,7 +197,7 @@ Status DBImpl::FlushMemTableToOutputFile( // releases and re-acquires the db mutex. In the meantime, the application // can still insert into the memtables and increase the db's sequence number. // The application can take a snapshot, hoping that the latest visible state - // to this snapshto is preserved. This is hard to guarantee since db mutex + // to this snapshot is preserved. This is hard to guarantee since db mutex // not held. This newly-created snapshot is not included in `snapshot_seqs` // and the flush job is unaware of its presence. Consequently, the flush job // may drop certain keys when generating the L0, causing incorrect data to be From e44de04d380042aded01c77dfcc2005350539bd7 Mon Sep 17 00:00:00 2001 From: "Peter (Stig) Edwards" Date: Thu, 21 Sep 2023 13:52:01 -0700 Subject: [PATCH 08/18] Use *next_sequence -1 here (#11861) Summary: To fix off-by-one error: Transaction could not check for conflicts for operation at SequenceNumber 500000 as the MemTable only contains changes newer than SequenceNumber 500001. 
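To make the off-by-one concrete (numbers taken from the error message above; an illustrative sketch, not the full recovery path):

    // Names follow DBImpl::RecoverLogFiles().
    // After WAL replay has applied the record with sequence 500000,
    // *next_sequence holds 500001 -- the sequence the *next* record would get.
    // Seeding the fresh memtable with *next_sequence told it that its oldest
    // change is 500001, so a conflict check at SequenceNumber 500000 was
    // rejected. *next_sequence - 1 == 500000 is the last sequence actually
    // recovered, which keeps that check possible.
    cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), *next_sequence - 1);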
Fixes https://github.com/facebook/rocksdb/issues/11822 I think introduced in https://github.com/facebook/rocksdb/commit/a657ee9a9c4a2acb529b8f5567965e4bf6d38fd5 Pull Request resolved: https://github.com/facebook/rocksdb/pull/11861 Reviewed By: pdillinger Differential Revision: D49457273 Pulled By: ajkr fbshipit-source-id: b527cbae4ecc7874633a11f07027adee62940d74 --- db/db_impl/db_impl_open.cc | 2 +- .../optimistic_transaction_test.cc | 41 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index eaa5775b6..818925291 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1302,7 +1302,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& wal_numbers, flushed = true; cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(), - *next_sequence); + *next_sequence - 1); } } } diff --git a/utilities/transactions/optimistic_transaction_test.cc b/utilities/transactions/optimistic_transaction_test.cc index 46d51956f..7f3d8ae33 100644 --- a/utilities/transactions/optimistic_transaction_test.cc +++ b/utilities/transactions/optimistic_transaction_test.cc @@ -1611,6 +1611,47 @@ TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverTest) { delete transaction; } +#ifdef __SANITIZE_THREAD__ +// Skip OptimisticTransactionTest.SequenceNumberAfterRecoverLargeTest under TSAN +// to avoid false positive because of TSAN lock limit of 64. +#else +TEST_P(OptimisticTransactionTest, SequenceNumberAfterRecoverLargeTest) { + WriteOptions write_options; + OptimisticTransactionOptions transaction_options; + + Transaction* transaction( + txn_db->BeginTransaction(write_options, transaction_options)); + + std::string value(1024 * 1024, 'X'); + const size_t n_zero = 2; + std::string s_i; + Status s; + for (int i = 1; i <= 64; i++) { + s_i = std::to_string(i); + auto key = std::string(n_zero - std::min(n_zero, s_i.length()), '0') + s_i; + s = transaction->Put(key, value); + ASSERT_OK(s); + } + + s = transaction->Commit(); + ASSERT_OK(s); + delete transaction; + + Reopen(); + transaction = txn_db->BeginTransaction(write_options, transaction_options); + s = transaction->Put("bar", "val"); + ASSERT_OK(s); + s = transaction->Commit(); + if (!s.ok()) { + std::cerr << "Failed to commit records. Error: " << s.ToString() + << std::endl; + } + ASSERT_OK(s); + + delete transaction; +} +#endif // __SANITIZE_THREAD__ + TEST_P(OptimisticTransactionTest, TimestampedSnapshotMissingCommitTs) { std::unique_ptr txn(txn_db->BeginTransaction(WriteOptions())); ASSERT_OK(txn->Put("a", "v")); From e467f0392a3a97ba0843feb8109fa060592a1d81 Mon Sep 17 00:00:00 2001 From: Changyu Bi Date: Mon, 6 Nov 2023 07:41:36 -0800 Subject: [PATCH 09/18] Add missing status check in ExternalSstFileIngestionJob and ImportColumnFamilyJob (#12042) Summary: .. and update some unit tests that failed with this change. See comment in ExternalSSTFileBasicTest.IngestFileWithCorruptedDataBlock for more explanation. The missing status check is not caught by `ASSERT_STATUS_CHECKED=1` due to this line: https://github.com/facebook/rocksdb/blob/8505b26db19871a8c8782a35a7b5be9d321d45e0/table/block_based/block.h#L394. Will explore if we can remove it. Pull Request resolved: https://github.com/facebook/rocksdb/pull/12042 Test Plan: existing unit tests. 
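For context, the missing check in sketch form (simplified and with illustrative names; the real GetIngestedFileInfo code in the diff below also derives the largest key and the range tombstone bounds):

    iter->SeekToFirst();
    if (iter->Valid()) {
      smallest_internal_key = iter->key();  // see the diff for the exact calls
    } else if (!iter->status().ok()) {
      // An invalid iterator can mean "no entries" *or* a read error such as a
      // corrupted data block; without this check the error case was treated as
      // an empty key range and silently dropped.
      return iter->status();
    }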
Reviewed By: ajkr Differential Revision: D50994769 Pulled By: cbi42 fbshipit-source-id: c91615bccd6094a91634c50b98401d456cbb927b --- db/external_sst_file_basic_test.cc | 35 ++++++++++++++++----------- db/external_sst_file_ingestion_job.cc | 6 +++-- db/import_column_family_job.cc | 2 ++ 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/db/external_sst_file_basic_test.cc b/db/external_sst_file_basic_test.cc index 39334994a..602e57c60 100644 --- a/db/external_sst_file_basic_test.cc +++ b/db/external_sst_file_basic_test.cc @@ -1548,6 +1548,11 @@ TEST_F(ExternalSSTFileBasicTest, RangeDeletionEndComesBeforeStart) { } TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { + bool verify_checksums_before_ingest = std::get<1>(GetParam()); + if (!verify_checksums_before_ingest) { + ROCKSDB_GTEST_BYPASS("Bypassing test when !verify_checksums_before_ingest"); + return; + } bool change_checksum_called = false; const auto& change_checksum = [&](void* arg) { if (!change_checksum_called) { @@ -1565,24 +1570,20 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithBadBlockChecksum) { SyncPoint::GetInstance()->EnableProcessing(); int file_id = 0; bool write_global_seqno = std::get<0>(GetParam()); - bool verify_checksums_before_ingest = std::get<1>(GetParam()); do { Options options = CurrentOptions(); DestroyAndReopen(options); std::map true_data; Status s = GenerateAndAddExternalFile( options, {1, 2, 3, 4, 5, 6}, ValueType::kTypeValue, file_id++, - write_global_seqno, verify_checksums_before_ingest, &true_data); - if (verify_checksums_before_ingest) { - ASSERT_NOK(s); - } else { - ASSERT_OK(s); - } + write_global_seqno, /*verify_checksums_before_ingest=*/true, + &true_data); + ASSERT_NOK(s); change_checksum_called = false; } while (ChangeOptionsForFileIngestionTest()); } -TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { +TEST_P(ExternalSSTFileBasicTest, IngestFileWithCorruptedDataBlock) { if (!random_rwfile_supported_) { ROCKSDB_GTEST_SKIP("Test requires NewRandomRWFile support"); return; @@ -1590,15 +1591,21 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { SyncPoint::GetInstance()->DisableProcessing(); int file_id = 0; EnvOptions env_options; + Random rnd(301); do { Options options = CurrentOptions(); + options.compression = kNoCompression; + BlockBasedTableOptions table_options; + table_options.block_size = 4 * 1024; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); std::string file_path = sst_files_dir_ + std::to_string(file_id++); SstFileWriter sst_file_writer(env_options, options); Status s = sst_file_writer.Open(file_path); ASSERT_OK(s); + // This should write more than 2 data blocks. for (int i = 0; i != 100; ++i) { std::string key = Key(i); - std::string value = Key(i) + std::to_string(0); + std::string value = rnd.RandomString(200); ASSERT_OK(sst_file_writer.Put(key, value)); } ASSERT_OK(sst_file_writer.Finish()); @@ -1609,11 +1616,11 @@ TEST_P(ExternalSSTFileBasicTest, IngestFileWithFirstByteTampered) { ASSERT_GT(file_size, 8); std::unique_ptr rwfile; ASSERT_OK(env_->NewRandomRWFile(file_path, &rwfile, EnvOptions())); - // Manually corrupt the file - // We deterministically corrupt the first byte because we currently - // cannot choose a random offset. The reason for this limitation is that - // we do not checksum property block at present. - const uint64_t offset = 0; + // Corrupt the second data block. 
+ // We need to corrupt a non-first and non-last data block + // since we access them to get smallest and largest internal + // key in the file in GetIngestedFileInfo(). + const uint64_t offset = 5000; char scratch[8] = {0}; Slice buf; ASSERT_OK(rwfile->Read(offset, sizeof(scratch), &buf, scratch)); diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index f95a436b4..7b9990826 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -786,8 +786,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion)); - std::unique_ptr range_del_iter( - table_reader->NewRangeTombstoneIterator(ro)); // Get first (smallest) and last (largest) key from file. file_to_ingest->smallest_internal_key = @@ -821,8 +819,12 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->largest_internal_key.SetFrom(key); bounds_set = true; + } else if (!iter->status().ok()) { + return iter->status(); } + std::unique_ptr range_del_iter( + table_reader->NewRangeTombstoneIterator(ro)); // We may need to adjust these key bounds, depending on whether any range // deletion tombstones extend past them. const Comparator* ucmp = cfd_->internal_comparator().user_comparator(); diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index c6c9099bc..9e9f4094b 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -362,6 +362,8 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( iter->SeekToLast(); file_to_import->largest_internal_key.DecodeFrom(iter->key()); bound_set = true; + } else if (!iter->status().ok()) { + return iter->status(); } std::unique_ptr range_del_iter{ From 69df2fecb130cb002f103d332dd1976e972a0033 Mon Sep 17 00:00:00 2001 From: leipeng Date: Tue, 19 Dec 2023 00:21:27 +0800 Subject: [PATCH 10/18] LookupKey::LookupKey: UNLIKELY(nullptr != ts) --- db/dbformat.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/dbformat.cc b/db/dbformat.cc index df0786e41..59a093e9f 100644 --- a/db/dbformat.cc +++ b/db/dbformat.cc @@ -217,7 +217,7 @@ LookupKey::LookupKey(const Slice& _user_key, SequenceNumber s, memcpy(dst, buf, klen_len); dst += klen_len; memcpy(dst, _user_key.data(), usize); dst += usize; - if (nullptr != ts) { + if (UNLIKELY(nullptr != ts)) { memcpy(dst, ts->data(), ts_sz); dst += ts_sz; } From a854d11bb75f99b2668dfc5bca9e28c8f5124c18 Mon Sep 17 00:00:00 2001 From: leipeng Date: Tue, 19 Dec 2023 23:21:55 +0800 Subject: [PATCH 11/18] db/builder.cc: match(%d:%lld.sst) --- db/builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/db/builder.cc b/db/builder.cc index b1bdfc4e7..74673dbc9 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -397,7 +397,7 @@ Status BuildTable( if (s.ok() && !output_validator.CompareValidator(file_validator)) { #if !defined(ROCKSDB_UNIT_TEST) auto& fd = meta->fd; - ROCKSDB_DIE("BuildTable: Paranoid checksums do not match(%d/%lld.sst)", + ROCKSDB_DIE("BuildTable: Paranoid checksums do not match(%d:%lld.sst)", fd.GetPathId(), (long long)fd.GetNumber()); #else s = Status::Corruption("BuildTable: Paranoid checksums do not match"); From 5bf3d11e981cae43eaaf5ef0a08fefefd1ccf22c Mon Sep 17 00:00:00 2001 From: leipeng Date: Wed, 20 Dec 2023 09:56:57 +0800 Subject: [PATCH 12/18] OutputValidator: Improve full check performance --- 
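Note on the approach: instead of keeping a std::pair of std::strings per recorded key/value (with per-entry string copies and allocations), the full-check log now appends every key and value to a single compact container (terark::fstrvecll) reserved up front, so OutputValidator_full_check no longer pays per-entry allocation costs in Add(). The gist, sketched with standard containers only (the terark container's actual API differs):

    #include <string>
    #include <string_view>
    #include <vector>

    // Minimal sketch of a "flat" key/value log: all bytes live in one buffer,
    // only offsets are stored, so appends are amortized O(1) with no per-entry
    // allocations. Piece 2*i is the i-th key, piece 2*i+1 its value.
    struct FlatKVLog {
      std::string bytes;
      std::vector<size_t> offsets{0};

      void Append(std::string_view s) {
        bytes.append(s.data(), s.size());
        offsets.push_back(bytes.size());
      }
      std::string_view Piece(size_t i) const {
        return std::string_view(bytes).substr(offsets[i], offsets[i + 1] - offsets[i]);
      }
      size_t NumPieces() const { return offsets.size() - 1; }
    };

CompareValidator() then walks both logs piece by piece, which is what the verification loop over kv_vec_ in the diff below does.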
db/output_validator.cc | 22 +++++++++++++++------- db/output_validator.h | 4 +++- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/db/output_validator.cc b/db/output_validator.cc index 6402de435..673985204 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -14,6 +14,11 @@ namespace ROCKSDB_NAMESPACE { static bool g_full_check = terark::getEnvBool("OutputValidator_full_check"); void OutputValidator::Init() { + full_check_ = g_full_check; + if (full_check_) { + std::destroy_at(&kv_vec_); + new(&kv_vec_)decltype(kv_vec_)(terark::valvec_reserve(), 128<<10, 32<<20); + } if (icmp_.IsForwardBytewise()) m_add = &OutputValidator::Add_tpl; else if (icmp_.IsReverseBytewise()) @@ -48,20 +53,23 @@ Status OutputValidator::Add_tpl(const Slice key, const Slice value) { memcpy(prev_key_.data(), key.data(), key.size()); #endif } - if (g_full_check) { - kv_vec_.emplace_back(key.ToString(), value.ToString()); + if (full_check_) { + kv_vec_.push_back(key); + kv_vec_.push_back(value); } return Status::OK(); } +static inline Slice SliceOf(terark::fstring s) { return {s.p, s.size()}; } bool OutputValidator::CompareValidator(const OutputValidator& other) { - if (g_full_check) { + if (full_check_) { long long file_number = m_file_number ? m_file_number : other.m_file_number; ROCKSDB_VERIFY_EQ(kv_vec_.size(), other.kv_vec_.size()); - for (size_t i = 0, n = kv_vec_.size(); i < n; i++) { - #define hex(deref, field) ParsedInternalKey(deref kv_vec_[i].field).DebugString(true, true).c_str() - ROCKSDB_VERIFY_F(kv_vec_[i].first == other.kv_vec_[i].first , "%06lld.sst[%zd]: %s %s", file_number, i, hex(,first ), hex(other., first )); - ROCKSDB_VERIFY_F(kv_vec_[i].second == other.kv_vec_[i].second, "%06lld.sst[%zd]: %s %s", file_number, i, hex(,second), hex(other., second)); + for (size_t i = 0, n = kv_vec_.size() / 2; i < n; i++) { + #define hex(deref, field) ParsedInternalKey(SliceOf(deref kv_vec_[field])).DebugString(true, true).c_str() + size_t key = 2*i + 0, val = 2*i + 1; + ROCKSDB_VERIFY_F(kv_vec_[key] == other.kv_vec_[key], "%06lld.sst[%zd]: %s %s", file_number, i, hex(,key), hex(other.,key)); + ROCKSDB_VERIFY_F(kv_vec_[val] == other.kv_vec_[val], "%06lld.sst[%zd]: %s %s", file_number, i, hex(,val), hex(other.,val)); } ROCKSDB_VERIFY_EQ(GetHash(), other.GetHash()); } diff --git a/db/output_validator.h b/db/output_validator.h index 0baf4a9ef..3d1d2cf2f 100644 --- a/db/output_validator.h +++ b/db/output_validator.h @@ -8,6 +8,7 @@ #include "rocksdb/slice.h" #include "rocksdb/status.h" #include +#include namespace ROCKSDB_NAMESPACE { // A class that validates key/value that is inserted to an SST file. 
@@ -53,6 +54,7 @@ class OutputValidator { uint64_t paranoid_hash_ = 0; bool enable_order_check_; bool enable_hash_; - std::vector > kv_vec_; + bool full_check_ = false; + terark::fstrvecll kv_vec_{terark::valvec_no_init()}; }; } // namespace ROCKSDB_NAMESPACE From 97f3e9fb35d4d68b9b3964a3cd12e8e53a991fcb Mon Sep 17 00:00:00 2001 From: leipeng Date: Wed, 20 Dec 2023 15:09:17 +0800 Subject: [PATCH 13/18] OutputValidator: Improve full check performance -- more improve --- db/output_validator.cc | 42 ++++++++++++++++++++++++++++++++---------- db/output_validator.h | 5 +++-- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/db/output_validator.cc b/db/output_validator.cc index 673985204..bde2f4190 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -8,16 +8,17 @@ #include "test_util/sync_point.h" #include "util/hash.h" #include +#include namespace ROCKSDB_NAMESPACE { +using terark::fstring; static bool g_full_check = terark::getEnvBool("OutputValidator_full_check"); void OutputValidator::Init() { full_check_ = g_full_check; if (full_check_) { - std::destroy_at(&kv_vec_); - new(&kv_vec_)decltype(kv_vec_)(terark::valvec_reserve(), 128<<10, 32<<20); + kv_vec_.reserve(32 << 20); // 32M } if (icmp_.IsForwardBytewise()) m_add = &OutputValidator::Add_tpl; @@ -54,23 +55,44 @@ Status OutputValidator::Add_tpl(const Slice key, const Slice value) { #endif } if (full_check_) { - kv_vec_.push_back(key); - kv_vec_.push_back(value); + auto WriteSlice = [this](Slice s) { + unsigned char buf[16]; + size_t len = terark::save_var_uint64(buf, s.size_) - buf; + kv_vec_.append(buf, len); + kv_vec_.append(s.data_, s.size_); + }; + WriteSlice(key); + WriteSlice(value); } + num_kv_++; return Status::OK(); } -static inline Slice SliceOf(terark::fstring s) { return {s.p, s.size()}; } +static Slice ReadSlice(const unsigned char** ptr) { + size_t len = (size_t)terark::load_var_uint64(*ptr, ptr); + auto data = (const char*)(*ptr); + *ptr += len; + return Slice(data, len); +} + bool OutputValidator::CompareValidator(const OutputValidator& other) { if (full_check_) { long long file_number = m_file_number ? 
m_file_number : other.m_file_number; ROCKSDB_VERIFY_EQ(kv_vec_.size(), other.kv_vec_.size()); - for (size_t i = 0, n = kv_vec_.size() / 2; i < n; i++) { - #define hex(deref, field) ParsedInternalKey(SliceOf(deref kv_vec_[field])).DebugString(true, true).c_str() - size_t key = 2*i + 0, val = 2*i + 1; - ROCKSDB_VERIFY_F(kv_vec_[key] == other.kv_vec_[key], "%06lld.sst[%zd]: %s %s", file_number, i, hex(,key), hex(other.,key)); - ROCKSDB_VERIFY_F(kv_vec_[val] == other.kv_vec_[val], "%06lld.sst[%zd]: %s %s", file_number, i, hex(,val), hex(other.,val)); + ROCKSDB_VERIFY_EQ(num_kv_, other.num_kv_); + const unsigned char* x_reader = kv_vec_.begin(); + const unsigned char* y_reader = other.kv_vec_.begin(); + for (size_t i = 0, n = num_kv_; i < n; i++) { + Slice kx = ReadSlice(&x_reader); + Slice vx = ReadSlice(&x_reader); + Slice ky = ReadSlice(&y_reader); + Slice vy = ReadSlice(&y_reader); + #define HexKey(key) ParsedInternalKey(key).DebugString(true, true).c_str() + ROCKSDB_VERIFY_F(kx == ky, "%06lld.sst[%zd]: %s %s", file_number, i, HexKey(kx), HexKey(ky)); + ROCKSDB_VERIFY_F(vx == vy, "%06lld.sst[%zd]: %s %s", file_number, i, vx.hex().c_str(), vy.hex().c_str()); } + ROCKSDB_VERIFY_EQ(x_reader, kv_vec_.end()); + ROCKSDB_VERIFY_EQ(y_reader, other.kv_vec_.end()); ROCKSDB_VERIFY_EQ(GetHash(), other.GetHash()); } return GetHash() == other.GetHash(); diff --git a/db/output_validator.h b/db/output_validator.h index 3d1d2cf2f..d33195770 100644 --- a/db/output_validator.h +++ b/db/output_validator.h @@ -7,8 +7,8 @@ #include "db/dbformat.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include #include -#include namespace ROCKSDB_NAMESPACE { // A class that validates key/value that is inserted to an SST file. @@ -55,6 +55,7 @@ class OutputValidator { bool enable_order_check_; bool enable_hash_; bool full_check_ = false; - terark::fstrvecll kv_vec_{terark::valvec_no_init()}; + size_t num_kv_ = 0; + terark::valvec kv_vec_; }; } // namespace ROCKSDB_NAMESPACE From a2fc33c9254e8b99565dd80d4220799cc3fdb58b Mon Sep 17 00:00:00 2001 From: leipeng Date: Wed, 20 Dec 2023 23:26:16 +0800 Subject: [PATCH 14/18] OutputValidator: more debug info --- db/output_validator.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/db/output_validator.cc b/db/output_validator.cc index bde2f4190..51fa9b561 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -78,7 +78,11 @@ static Slice ReadSlice(const unsigned char** ptr) { bool OutputValidator::CompareValidator(const OutputValidator& other) { if (full_check_) { long long file_number = m_file_number ? 
m_file_number : other.m_file_number; - ROCKSDB_VERIFY_EQ(kv_vec_.size(), other.kv_vec_.size()); + if (kv_vec_.size() != other.kv_vec_.size()) { + fprintf(stderr, + "FATAL: OutputValidator::CompareValidator: kv_vec_.size: %zd != %zd\n", + kv_vec_.size(), other.kv_vec_.size()); + } ROCKSDB_VERIFY_EQ(num_kv_, other.num_kv_); const unsigned char* x_reader = kv_vec_.begin(); const unsigned char* y_reader = other.kv_vec_.begin(); From ffd30868da34e6672ed5e6020451b90bcfd3ff76 Mon Sep 17 00:00:00 2001 From: leipeng Date: Thu, 21 Dec 2023 11:19:40 +0800 Subject: [PATCH 15/18] OutputValidator: more debug info - 2 --- db/output_validator.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/db/output_validator.cc b/db/output_validator.cc index 51fa9b561..86cd38974 100644 --- a/db/output_validator.cc +++ b/db/output_validator.cc @@ -92,8 +92,8 @@ bool OutputValidator::CompareValidator(const OutputValidator& other) { Slice ky = ReadSlice(&y_reader); Slice vy = ReadSlice(&y_reader); #define HexKey(key) ParsedInternalKey(key).DebugString(true, true).c_str() - ROCKSDB_VERIFY_F(kx == ky, "%06lld.sst[%zd]: %s %s", file_number, i, HexKey(kx), HexKey(ky)); - ROCKSDB_VERIFY_F(vx == vy, "%06lld.sst[%zd]: %s %s", file_number, i, vx.hex().c_str(), vy.hex().c_str()); + ROCKSDB_VERIFY_F(kx == ky, "%06lld.sst[%zd]: %zd(%s) %zd(%s)", file_number, i, kx.size_, HexKey(kx), ky.size_, HexKey(ky)); + ROCKSDB_VERIFY_F(vx == vy, "%06lld.sst[%zd]: %zd(%s) %zd(%s)", file_number, i, vx.size_, vx.hex().c_str(), vy.size_, vy.hex().c_str()); } ROCKSDB_VERIFY_EQ(x_reader, kv_vec_.end()); ROCKSDB_VERIFY_EQ(y_reader, other.kv_vec_.end()); From 8943a4dc8cc96b3356978ecc1e828e9f76bfa4e0 Mon Sep 17 00:00:00 2001 From: leipeng Date: Thu, 21 Dec 2023 22:41:05 +0800 Subject: [PATCH 16/18] Pretty InternalStats::DumpDBStatsWriteStall/DumpCFStatsWriteStall --- db/internal_stats.cc | 61 +++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/db/internal_stats.cc b/db/internal_stats.cc index e3fd47934..e64721159 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -31,6 +31,11 @@ #include "util/hash_containers.h" #include "util/string_util.h" +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wnonnull" // for boost::replace_all_copy +#endif +#include + namespace ROCKSDB_NAMESPACE { @@ -1729,6 +1734,34 @@ void InternalStats::DumpDBMapStatsWriteStall( } } +static void DumpWriteStalls(std::ostringstream& str, + std::map& stats_map) { + str << "Write Stall (count): "; + + for (auto iter = stats_map.begin(); iter != stats_map.end(); ) { + std::string name = boost::replace_all_copy(iter->first, "-delays", ""); + str << name << ": delays " << iter->second; + ++iter; + if (stats_map.end() == iter) { + break; // should not goes here, check for safe + } + std::string name2 = boost::replace_all_copy(iter->first, "-stops", ""); + if (name2 == name) { + str << ", stops " << iter->second; + } + else { // should not goes here + str << iter->first << ": " << iter->second; + } + auto next = std::next(iter); + if (stats_map.end() == next) { + str << "\n"; + } else { + str << " | "; + } + iter = next; + } +} + void InternalStats::DumpDBStatsWriteStall(std::string* value) { assert(value); @@ -1736,19 +1769,7 @@ void InternalStats::DumpDBStatsWriteStall(std::string* value) { DumpDBMapStatsWriteStall(&write_stall_stats_map); std::ostringstream str; - str << "Write Stall (count): "; - - for (auto write_stall_stats_map_iter = write_stall_stats_map.begin(); - 
write_stall_stats_map_iter != write_stall_stats_map.end(); - write_stall_stats_map_iter++) { - const auto& name_and_stat = *write_stall_stats_map_iter; - str << name_and_stat.first << ": " << name_and_stat.second; - if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { - str << "\n"; - } else { - str << ", "; - } - } + DumpWriteStalls(str, write_stall_stats_map); *value = str.str(); } @@ -1935,19 +1956,7 @@ void InternalStats::DumpCFStatsWriteStall(std::string* value, DumpCFMapStatsWriteStall(&write_stall_stats_map); std::ostringstream str; - str << "Write Stall (count): "; - - for (auto write_stall_stats_map_iter = write_stall_stats_map.begin(); - write_stall_stats_map_iter != write_stall_stats_map.end(); - write_stall_stats_map_iter++) { - const auto& name_and_stat = *write_stall_stats_map_iter; - str << name_and_stat.first << ": " << name_and_stat.second; - if (std::next(write_stall_stats_map_iter) == write_stall_stats_map.end()) { - str << "\n"; - } else { - str << ", "; - } - } + DumpWriteStalls(str, write_stall_stats_map); if (total_stall_count) { *total_stall_count = From 3d1a61a9642b3392721f35b82c0927d0065b1821 Mon Sep 17 00:00:00 2001 From: leipeng Date: Fri, 22 Dec 2023 11:01:45 +0800 Subject: [PATCH 17/18] Add CFOptions::allow_merge_memtables, default true for conform RocksDB --- db/memtable.cc | 1 + db/memtable.h | 1 + db/memtable_list.cc | 3 +++ db_stress_tool/db_stress_common.h | 1 + db_stress_tool/db_stress_gflags.cc | 4 ++++ include/rocksdb/advanced_options.h | 3 +++ options/cf_options.cc | 7 +++++++ options/cf_options.h | 3 +++ options/options.cc | 4 ++++ options/options_helper.cc | 1 + options/options_settable_test.cc | 1 + sideplugin/rockside | 2 +- tools/db_bench_tool.cc | 3 +++ 13 files changed, 33 insertions(+), 1 deletion(-) diff --git a/db/memtable.cc b/db/memtable.cc index c0cbb6bd5..9349d3ed4 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -55,6 +55,7 @@ ImmutableMemTableOptions::ImmutableMemTableOptions( mutable_cf_options.memtable_prefix_bloom_size_ratio) * 8u), memtable_huge_page_size(mutable_cf_options.memtable_huge_page_size), + allow_merge_memtables(mutable_cf_options.allow_merge_memtables), memtable_whole_key_filtering( mutable_cf_options.memtable_whole_key_filtering), inplace_update_support(ioptions.inplace_update_support), diff --git a/db/memtable.h b/db/memtable.h index b1c1ad184..f0a798315 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -47,6 +47,7 @@ struct ImmutableMemTableOptions { size_t arena_block_size; uint32_t memtable_prefix_bloom_bits; size_t memtable_huge_page_size; + bool allow_merge_memtables; bool memtable_whole_key_filtering; bool inplace_update_support; size_t inplace_update_num_locks; diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 4cf8d2d47..3a7e15655 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -419,6 +419,9 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, std::max(m->GetNextLogNumber(), *max_next_log_number); } ret->push_back(m); + if (!m->GetImmutableMemTableOptions()->allow_merge_memtables) { + break; + } } else if (!ret->empty()) { // This `break` is necessary to prevent picking non-consecutive memtables // in case `memlist` has one or more entries with diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index c0b1e6fd2..aceb28f28 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -108,6 +108,7 @@ DECLARE_int32(min_write_buffer_number_to_merge); 
DECLARE_int32(max_write_buffer_number_to_maintain); DECLARE_int64(max_write_buffer_size_to_maintain); DECLARE_double(memtable_prefix_bloom_size_ratio); +DECLARE_bool(allow_merge_memtables); DECLARE_bool(memtable_whole_key_filtering); DECLARE_int32(open_files); DECLARE_int64(compressed_cache_size); diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index df4c3be0b..0657cbbbc 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -190,6 +190,10 @@ DEFINE_double(memtable_prefix_bloom_size_ratio, "creates prefix blooms for memtables, each with size " "`write_buffer_size * memtable_prefix_bloom_size_ratio`."); +DEFINE_bool(allow_merge_memtables, + ROCKSDB_NAMESPACE::Options().allow_merge_memtables, + "allow merge memtables on flush."); + DEFINE_bool(memtable_whole_key_filtering, ROCKSDB_NAMESPACE::Options().memtable_whole_key_filtering, "Enable whole key filtering in memtables."); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 58171ecbc..10d4ffdb4 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -463,6 +463,9 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through SetOptions() API double memtable_prefix_bloom_size_ratio = 0.0; + // set as false to avoid flush output large SSTs + bool allow_merge_memtables = true; + // Enable whole key bloom filter in memtable. Note this will only take effect // if memtable_prefix_bloom_size_ratio is not 0. Enabling whole key filtering // can potentially reduce CPU usage for point-look-ups. diff --git a/options/cf_options.cc b/options/cf_options.cc index d665ec5d1..75221a4e2 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -360,6 +360,10 @@ static std::unordered_map {"memtable_prefix_bloom_probes", {0, OptionType::kUInt32T, OptionVerificationType::kDeprecated, OptionTypeFlags::kMutable}}, + {"allow_merge_memtables", + {offsetof(struct MutableCFOptions, allow_merge_memtables), + OptionType::kBoolean, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, {"memtable_whole_key_filtering", {offsetof(struct MutableCFOptions, memtable_whole_key_filtering), OptionType::kBoolean, OptionVerificationType::kNormal, @@ -1048,6 +1052,9 @@ void MutableCFOptions::Dump(Logger* log) const { arena_block_size); ROCKS_LOG_INFO(log, " memtable_prefix_bloom_ratio: %f", memtable_prefix_bloom_size_ratio); + + ROCKS_LOG_INFO(log, " allow_merge_memtables: %d", + allow_merge_memtables); ROCKS_LOG_INFO(log, " memtable_whole_key_filtering: %d", memtable_whole_key_filtering); ROCKS_LOG_INFO(log, diff --git a/options/cf_options.h b/options/cf_options.h index 720ae8148..58bb79533 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -118,6 +118,7 @@ struct MutableCFOptions { arena_block_size(options.arena_block_size), memtable_prefix_bloom_size_ratio( options.memtable_prefix_bloom_size_ratio), + allow_merge_memtables(options.allow_merge_memtables), memtable_whole_key_filtering(options.memtable_whole_key_filtering), memtable_huge_page_size(options.memtable_huge_page_size), max_successive_merges(options.max_successive_merges), @@ -188,6 +189,7 @@ struct MutableCFOptions { max_write_buffer_number(0), arena_block_size(0), memtable_prefix_bloom_size_ratio(0), + allow_merge_memtables(true), memtable_whole_key_filtering(false), memtable_huge_page_size(0), max_successive_merges(0), @@ -255,6 +257,7 @@ struct MutableCFOptions { int max_write_buffer_number; size_t arena_block_size; double 
memtable_prefix_bloom_size_ratio; + bool allow_merge_memtables; bool memtable_whole_key_filtering; size_t memtable_huge_page_size; size_t max_successive_merges; diff --git a/options/options.cc b/options/options.cc index 954b28457..d0c92f81b 100644 --- a/options/options.cc +++ b/options/options.cc @@ -55,6 +55,7 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) inplace_callback(options.inplace_callback), memtable_prefix_bloom_size_ratio( options.memtable_prefix_bloom_size_ratio), + allow_merge_memtables(options.allow_merge_memtables), memtable_whole_key_filtering(options.memtable_whole_key_filtering), memtable_huge_page_size(options.memtable_huge_page_size), memtable_insert_with_hint_prefix_extractor( @@ -396,6 +397,9 @@ void ColumnFamilyOptions::Dump(Logger* log) const { ROCKS_LOG_HEADER( log, " Options.memtable_prefix_bloom_size_ratio: %f", memtable_prefix_bloom_size_ratio); + ROCKS_LOG_HEADER(log, + " Options.allow_merge_memtables: %d", + allow_merge_memtables); ROCKS_LOG_HEADER(log, " Options.memtable_whole_key_filtering: %d", memtable_whole_key_filtering); diff --git a/options/options_helper.cc b/options/options_helper.cc index ee3b16971..c76fd96c8 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -200,6 +200,7 @@ void UpdateColumnFamilyOptions(const MutableCFOptions& moptions, cf_opts->arena_block_size = moptions.arena_block_size; cf_opts->memtable_prefix_bloom_size_ratio = moptions.memtable_prefix_bloom_size_ratio; + cf_opts->allow_merge_memtables = moptions.allow_merge_memtables; cf_opts->memtable_whole_key_filtering = moptions.memtable_whole_key_filtering; cf_opts->memtable_huge_page_size = moptions.memtable_huge_page_size; cf_opts->max_successive_merges = moptions.max_successive_merges; diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 95754836f..5c297e568 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -530,6 +530,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "merge_operator=aabcxehazrMergeOperator;" "memtable_prefix_bloom_size_ratio=0.4642;" "memtable_whole_key_filtering=true;" + "allow_merge_memtables=true;" "memtable_insert_with_hint_prefix_extractor=rocksdb.CappedPrefix.13;" "check_flush_compaction_key_order=false;" "paranoid_file_checks=true;" diff --git a/sideplugin/rockside b/sideplugin/rockside index 959b7a4de..8142146b2 160000 --- a/sideplugin/rockside +++ b/sideplugin/rockside @@ -1 +1 @@ -Subproject commit 959b7a4de941504b2efb95faed498ccd14e7e38e +Subproject commit 8142146b2f384a78bfb2595a48b7080720855617 diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 9f534a2eb..af81666d9 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -746,6 +746,8 @@ DEFINE_bool(use_ribbon_filter, false, "Use Ribbon instead of Bloom filter"); DEFINE_double(memtable_bloom_size_ratio, 0, "Ratio of memtable size used for bloom filter. 0 means no bloom " "filter."); +DEFINE_bool(allow_merge_memtables, true, + "allow merge memtables on flush."); DEFINE_bool(memtable_whole_key_filtering, false, "Try to use whole key bloom filter in memtables."); DEFINE_bool(memtable_use_huge_page, false, @@ -4190,6 +4192,7 @@ class Benchmark { } options.memtable_huge_page_size = FLAGS_memtable_use_huge_page ? 
2048 : 0; options.memtable_prefix_bloom_size_ratio = FLAGS_memtable_bloom_size_ratio; + options.allow_merge_memtables = FLAGS_allow_merge_memtables; options.memtable_whole_key_filtering = FLAGS_memtable_whole_key_filtering; if (FLAGS_memtable_insert_with_hint_prefix_size > 0) { options.memtable_insert_with_hint_prefix_extractor.reset( From 8db0b8da3673fbd3ce7e5999eda1271da50d9908 Mon Sep 17 00:00:00 2001 From: leipeng Date: Sat, 23 Dec 2023 16:42:44 +0800 Subject: [PATCH 18/18] MemTableList::PickMemtablesToFlush: do not allow merge memtable if SupportConvertToSST --- db/memtable_list.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/db/memtable_list.cc b/db/memtable_list.cc index 3a7e15655..5e3a69559 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -422,6 +422,9 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id, if (!m->GetImmutableMemTableOptions()->allow_merge_memtables) { break; } + if (m->SupportConvertToSST()) { + break; + } } else if (!ret->empty()) { // This `break` is necessary to prevent picking non-consecutive memtables // in case `memlist` has one or more entries with