diff --git a/nano/core_test/toml.cpp b/nano/core_test/toml.cpp
index ce861da1d8..80fd6c24a9 100644
--- a/nano/core_test/toml.cpp
+++ b/nano/core_test/toml.cpp
@@ -239,8 +239,9 @@ TEST (toml, daemon_config_deserialize_defaults)
 	ASSERT_EQ (conf.node.lmdb_config.map_size, defaults.node.lmdb_config.map_size);
 
 	ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
-	ASSERT_EQ (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
 	ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+	ASSERT_EQ (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
+	ASSERT_EQ (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);
 
 	ASSERT_EQ (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
 	ASSERT_EQ (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
@@ -573,8 +574,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)
 	[node.rocksdb]
 	enable = true
-	memory_multiplier = 3
 	io_threads = 99
+	read_cache = 99
+	write_cache = 99
 
 	[node.experimental]
 	secondary_work_peers = ["dev.org:998"]
@@ -743,8 +745,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)
 	ASSERT_TRUE (conf.node.rocksdb_config.enable);
 	ASSERT_EQ (nano::rocksdb_config::using_rocksdb_in_tests (), defaults.node.rocksdb_config.enable);
-	ASSERT_NE (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
 	ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
+	ASSERT_NE (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
+	ASSERT_NE (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);
 
 	ASSERT_NE (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
 	ASSERT_NE (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
diff --git a/nano/lib/rocksdbconfig.cpp b/nano/lib/rocksdbconfig.cpp
index a12605d1f7..100af267fd 100644
--- a/nano/lib/rocksdbconfig.cpp
+++ b/nano/lib/rocksdbconfig.cpp
@@ -5,25 +5,34 @@
 nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
 {
 	toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database.\ntype:bool");
-	toml.put ("memory_multiplier", memory_multiplier, "This will modify how much memory is used represented by 1 (low), 2 (medium), 3 (high). Default is 2.\ntype:uint8");
 	toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing.\ntype:uint32");
+	toml.put ("read_cache", read_cache, "Number of megabytes per table allocated to the read cache. Valid range is 1 - 1024. Default is 32.\nCarefully monitor memory usage if non-default values are used.\ntype:long");
+	toml.put ("write_cache", write_cache, "Total number of megabytes allocated to the write cache. Valid range is 1 - 256. Default is 64.\nCarefully monitor memory usage if non-default values are used.\ntype:long");
+
 	return toml.get_error ();
 }
 
 nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
 {
 	toml.get_optional ("enable", enable);
-	toml.get_optional ("memory_multiplier", memory_multiplier);
 	toml.get_optional ("io_threads", io_threads);
+	toml.get_optional ("read_cache", read_cache);
+	toml.get_optional ("write_cache", write_cache);
 
 	// Validate ranges
 	if (io_threads == 0)
 	{
 		toml.get_error ().set ("io_threads must be non-zero");
 	}
-	if (memory_multiplier < 1 || memory_multiplier > 3)
+
+	if (read_cache < 1 || read_cache > 1024)
+	{
+		toml.get_error ().set ("read_cache must be between 1 and 1024 MB");
+	}
+
+	if (write_cache < 1 || write_cache > 256)
 	{
-		toml.get_error ().set ("memory_multiplier must be either 1, 2 or 3");
+		toml.get_error ().set ("write_cache must be between 1 and 256 MB");
 	}
 
 	return toml.get_error ();
diff --git a/nano/lib/rocksdbconfig.hpp b/nano/lib/rocksdbconfig.hpp
index 232d320193..fabc1fb0a1 100644
--- a/nano/lib/rocksdbconfig.hpp
+++ b/nano/lib/rocksdbconfig.hpp
@@ -25,7 +25,8 @@ class rocksdb_config final
 	static bool using_rocksdb_in_tests ();
 
 	bool enable{ false };
-	uint8_t memory_multiplier{ 2 };
 	unsigned io_threads{ std::max (nano::hardware_concurrency () / 2, 1u) };
+	long read_cache{ 32 };
+	long write_cache{ 64 };
 };
 }
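Reviewer note: the two new options are plain megabyte counts that are scaled to bytes at the point of use. A minimal, self-contained sketch of that arithmetic and the accepted ranges; the helper names below are illustrative only, the authoritative validation is in rocksdb_config::deserialize_toml above.

#include <cstdint>
#include <stdexcept>

// Illustrative helpers only; not part of the patch.
constexpr std::uint64_t megabytes_to_bytes (long mb)
{
	return static_cast<std::uint64_t> (mb) * 1024 * 1024;
}

void validate_cache_sizes (long read_cache, long write_cache)
{
	// Same bounds as the TOML deserializer: 1 - 1024 MB for reads, 1 - 256 MB for writes.
	if (read_cache < 1 || read_cache > 1024)
	{
		throw std::invalid_argument ("read_cache must be between 1 and 1024 MB");
	}
	if (write_cache < 1 || write_cache > 256)
	{
		throw std::invalid_argument ("write_cache must be between 1 and 256 MB");
	}
}

int main ()
{
	validate_cache_sizes (32, 64); // the new defaults
	return megabytes_to_bytes (32) == 33554432ULL ? 0 : 1; // 32 MB == 33,554,432 bytes
}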
diff --git a/nano/store/rocksdb/rocksdb.cpp b/nano/store/rocksdb/rocksdb.cpp
index 60f7e95cee..439cb92db7 100644
--- a/nano/store/rocksdb/rocksdb.cpp
+++ b/nano/store/rocksdb/rocksdb.cpp
@@ -64,7 +64,7 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
 	logger{ logger_a },
 	constants{ constants },
 	rocksdb_config{ rocksdb_config_a },
-	max_block_write_batch_num_m{ nano::narrow_cast<unsigned> (blocks_memtable_size_bytes () / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
+	max_block_write_batch_num_m{ nano::narrow_cast<unsigned> ((rocksdb_config_a.write_cache * 1024 * 1024) / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
 	cf_name_table_map{ create_cf_name_table_map () }
 {
 	boost::system::error_code error_mkdir, error_chmod;
@@ -80,7 +80,6 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
 	debug_assert (path_a.filename () == "rocksdb");
 
 	generate_tombstone_map ();
-	small_table_factory.reset (::rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));
 
 	// TODO: get_db_options () registers a listener for resetting tombstones, needs to check if it is a problem calling it more than once.
 	auto options = get_db_options ();
@@ -400,120 +399,16 @@ void nano::store::rocksdb::component::generate_tombstone_map ()
 	tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
 }
 
-rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
-{
-	::rocksdb::ColumnFamilyOptions cf_options;
-	cf_options.table_factory = table_factory_a;
-
-	// (1 active, 1 inactive)
-	auto num_memtables = 2;
-
-	// Each level is a multiple of the above. If L1 is 512MB. L2 will be 512 * 8 = 2GB. L3 will be 2GB * 8 = 16GB, and so on...
-	cf_options.max_bytes_for_level_multiplier = 8;
-
-	// Although this should be the default provided by RocksDB, not setting this is causing sequence conflict checks if not using
-	cf_options.max_write_buffer_size_to_maintain = memtable_size_bytes_a * num_memtables;
-
-	// Files older than this (1 day) will be scheduled for compaction when there is no other background work. This can lead to more writes however.
-	cf_options.ttl = 1 * 24 * 60 * 60;
-
-	// Multiplier for each level
-	cf_options.target_file_size_multiplier = 10;
-
-	// Size of level 1 sst files
-	cf_options.target_file_size_base = memtable_size_bytes_a;
-
-	// Size of each memtable
-	cf_options.write_buffer_size = memtable_size_bytes_a;
-
-	// Number of memtables to keep in memory
-	cf_options.max_write_buffer_number = num_memtables;
-
-	return cf_options;
-}
-
 rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_cf_options (std::string const & cf_name_a) const
 {
 	::rocksdb::ColumnFamilyOptions cf_options;
-	auto const memtable_size_bytes = base_memtable_size_bytes ();
-	auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
-	if (cf_name_a == "blocks")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
-		cf_options = get_active_cf_options (table_factory, blocks_memtable_size_bytes ());
-	}
-	else if (cf_name_a == "confirmation_height")
-	{
-		// Entries will not be deleted in the normal case, so can make memtables a lot bigger
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
-	}
-	else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
-	{
-		// Meta - It contains just version key
-		// Online weight - Periodically deleted
-		// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
-		cf_options = get_small_cf_options (small_table_factory);
-	}
-	else if (cf_name_a == "cached_counts")
-	{
-		// Really small (keys are blocks tables, value is uint64_t)
-		cf_options = get_small_cf_options (small_table_factory);
-	}
-	else if (cf_name_a == "pending")
-	{
-		// Pending can have a lot of deletions too
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-
-		// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-		cf_options.level0_file_num_compaction_trigger = 2;
-
-		// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
-		cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
-	}
-	else if (cf_name_a == "frontiers")
-	{
-		// Frontiers is only needed during bootstrap for legacy blocks
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "accounts")
-	{
-		// Can have deletions from rollbacks
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "vote")
-	{
-		// No deletes it seems, only overwrites.
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "pruned")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "final_votes")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == "rep_weights")
-	{
-		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
-		cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
-	}
-	else if (cf_name_a == ::rocksdb::kDefaultColumnFamilyName)
-	{
-		// Do nothing.
-	}
-	else
+	if (cf_name_a != ::rocksdb::kDefaultColumnFamilyName)
 	{
-		debug_assert (false);
+		std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_table_options ()));
+		cf_options.table_factory = table_factory;
+		// Size of each memtable (write buffer for this column family)
+		cf_options.write_buffer_size = rocksdb_config.write_cache * 1024 * 1024;
 	}
-
 	return cf_options;
 }
@@ -863,30 +758,11 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
 	db_options.create_if_missing = true;
 	db_options.create_missing_column_families = true;
 
-	// TODO: review if this should be changed due to the unchecked table removal.
-	// Enable whole key bloom filter in memtables for ones with memtable_prefix_bloom_size_ratio set (unchecked table currently).
-	// It can potentially reduce CPU usage for point-look-ups.
-	db_options.memtable_whole_key_filtering = true;
-
-	// Sets the compaction priority
-	db_options.compaction_pri = ::rocksdb::CompactionPri::kMinOverlappingRatio;
-
-	// Start aggressively flushing WAL files when they reach over 1GB
-	db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;
-
 	// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
-	db_options.IncreaseParallelism (rocksdb_config.io_threads);
 	db_options.OptimizeLevelStyleCompaction ();
 
-	// Adds a separate write queue for memtable/WAL
-	db_options.enable_pipelined_write = true;
-
-	// Default is 16, setting to -1 allows faster startup times for SSDs by allowings more files to be read in parallel.
-	db_options.max_file_opening_threads = -1;
-
-	// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
-	// Default is 1GB, lowering this to avoid replaying for too long (100MB)
-	db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
+	// Set the maximum number of background threads
+	db_options.IncreaseParallelism (rocksdb_config.io_threads);
 
 	// Not compressing any SST files for compatibility reasons.
 	db_options.compression = ::rocksdb::kNoCompression;
@@ -899,75 +775,27 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
 	return db_options;
 }
 
-rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_active_table_options (std::size_t lru_size) const
+rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_table_options () const
 {
 	::rocksdb::BlockBasedTableOptions table_options;
 
 	// Improve point lookup performance be using the data block hash index (uses about 5% more space).
 	table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
-	table_options.data_block_hash_table_util_ratio = 0.75;
 
-	// Using format_version=4 significantly reduces the index block size, in some cases around 4-5x.
-	// This frees more space in block cache, which would result in higher hit rate for data and filter blocks,
-	// or offer the same performance with a smaller block cache size.
-	table_options.format_version = 4;
-	table_options.index_block_restart_interval = 16;
+	// Using storage format_version 5.
+	// Version 5 offers improved read speed, caching and better compression (if enabled).
+	// Any existing ledger data in version 4 will not be migrated; new data will be written in version 5.
+	table_options.format_version = 5;
 
 	// Block cache for reads
-	table_options.block_cache = ::rocksdb::NewLRUCache (lru_size);
+	table_options.block_cache = ::rocksdb::NewLRUCache (rocksdb_config.read_cache * 1024 * 1024);
 
 	// Bloom filter to help with point reads. 10bits gives 1% false positive rate.
 	table_options.filter_policy.reset (::rocksdb::NewBloomFilterPolicy (10, false));
 
-	// Increasing block_size decreases memory usage and space amplification, but increases read amplification.
-	table_options.block_size = 16 * 1024ULL;
-
-	// Whether level 0 index and filter blocks are stored in block_cache
-	table_options.pin_l0_filter_and_index_blocks_in_cache = true;
-
-	return table_options;
-}
-
-rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_small_table_options () const
-{
-	::rocksdb::BlockBasedTableOptions table_options;
-	// Improve point lookup performance be using the data block hash index (uses about 5% more space).
-	table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
-	table_options.data_block_hash_table_util_ratio = 0.75;
-	table_options.block_size = 1024ULL;
 
 	return table_options;
 }
 
-rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const
-{
-	auto const memtable_size_bytes = 10000;
-	auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes);
-
-	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-	cf_options.level0_file_num_compaction_trigger = 1;
-
-	// L1 size, compaction is triggered for L0 at this size (1 SST file in L1)
-	cf_options.max_bytes_for_level_base = memtable_size_bytes;
-
-	return cf_options;
-}
-
-::rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
-{
-	auto cf_options = get_common_cf_options (table_factory_a, memtable_size_bytes_a);
-
-	// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
-	cf_options.level0_file_num_compaction_trigger = 4;
-
-	// L1 size, compaction is triggered for L0 at this size (4 SST files in L1)
-	cf_options.max_bytes_for_level_base = memtable_size_bytes_a * 4;
-
-	// Size target of levels are changed dynamically based on size of the last level
-	cf_options.level_compaction_dynamic_level_bytes = true;
-
-	return cf_options;
-}
-
 void nano::store::rocksdb::component::on_flush (::rocksdb::FlushJobInfo const & flush_job_info_a)
 {
 	// Reset appropriate tombstone counters
@@ -1109,16 +937,6 @@ void nano::store::rocksdb::component::serialize_memory_stats (boost::property_tr
 	json.put ("block-cache-usage", val);
 }
 
-unsigned long long nano::store::rocksdb::component::blocks_memtable_size_bytes () const
-{
-	return base_memtable_size_bytes ();
-}
-
-unsigned long long nano::store::rocksdb::component::base_memtable_size_bytes () const
-{
-	return 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_memtable_size;
-}
-
 // This is a ratio of the blocks memtable size to keep total write transaction commit size down.
 unsigned nano::store::rocksdb::component::max_block_write_batch_num () const
 {
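Reviewer note: with the per-table branches removed, every non-default column family now shares one table-options shape. Below is a condensed, free-standing sketch of what get_table_options produces, assuming the RocksDB headers are available; the function name and parameter are mine, the member function above is authoritative. Worth noting that NewLRUCache is invoked per table factory, so read_cache is a per-table budget rather than a single global cache.

#include <rocksdb/cache.h>
#include <rocksdb/filter_policy.h>
#include <rocksdb/table.h>

// Hypothetical free-function mirror of component::get_table_options.
rocksdb::BlockBasedTableOptions make_table_options (long read_cache_mb)
{
	rocksdb::BlockBasedTableOptions table_options;
	// Hash index for faster point lookups (~5% extra space).
	table_options.data_block_index_type = rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
	// Newer SST format; existing v4 files stay readable, new files are written as v5.
	table_options.format_version = 5;
	// A separate LRU block cache is created on each call, so the read budget applies per table.
	table_options.block_cache = rocksdb::NewLRUCache (read_cache_mb * 1024 * 1024);
	// 10 bits/key bloom filter: roughly 1% false positive rate on point reads.
	table_options.filter_policy.reset (rocksdb::NewBloomFilterPolicy (10, false));
	return table_options;
}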
diff --git a/nano/store/rocksdb/rocksdb.hpp b/nano/store/rocksdb/rocksdb.hpp
index 5d8b22bba7..e106931c3d 100644
--- a/nano/store/rocksdb/rocksdb.hpp
+++ b/nano/store/rocksdb/rocksdb.hpp
@@ -108,7 +108,6 @@ class component : public nano::store::component
 	::rocksdb::TransactionDB * transaction_db = nullptr;
 	std::unique_ptr<::rocksdb::DB> db;
 	std::vector<std::unique_ptr<::rocksdb::ColumnFamilyHandle>> handles;
-	std::shared_ptr<::rocksdb::TableFactory> small_table_factory;
 	std::unordered_map<nano::tables, nano::mutex> write_lock_mutexes;
 	nano::rocksdb_config rocksdb_config;
 	unsigned const max_block_write_batch_num_m;
@@ -155,11 +154,7 @@ class component : public nano::store::component
 	void construct_column_family_mutexes ();
 	::rocksdb::Options get_db_options ();
-	::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
-	::rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
-	::rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
-	::rocksdb::BlockBasedTableOptions get_active_table_options (std::size_t lru_size) const;
-	::rocksdb::BlockBasedTableOptions get_small_table_options () const;
+	::rocksdb::BlockBasedTableOptions get_table_options () const;
 	::rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;
 	void on_flush (::rocksdb::FlushJobInfo const &);
@@ -169,11 +164,6 @@ class component : public nano::store::component
 	std::unordered_map<char const *, nano::tables> create_cf_name_table_map () const;
 	std::vector<::rocksdb::ColumnFamilyDescriptor> create_column_families ();
 
-	unsigned long long base_memtable_size_bytes () const;
-	unsigned long long blocks_memtable_size_bytes () const;
-
-	constexpr static int base_memtable_size = 16;
-	constexpr static int base_block_cache_size = 8;
 
 	friend class nano::rocksdb_block_store_tombstone_count_Test;
 	friend class rocksdb_block_store_upgrade_v21_v22_Test;
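Reviewer note: the constructor now derives max_block_write_batch_num_m directly from write_cache instead of the removed blocks memtable helpers. A worked example of that formula with hypothetical stand-in sizes, since the real sizeof (nano::block_type), nano::state_block::size and nano::block_sideband::size values are defined elsewhere in the tree:

#include <cstddef>
#include <iostream>

int main ()
{
	// Hypothetical stand-ins for the nano-specific sizes used in the constructor initializer.
	std::size_t const block_type_size = 1;
	std::size_t const state_block_size = 216;
	std::size_t const sideband_size = 144;

	long const write_cache_mb = 64; // default from rocksdb_config
	auto const write_cache_bytes = static_cast<unsigned long long> (write_cache_mb) * 1024 * 1024;

	// Same shape as the initializer: divide the memtable budget by twice the
	// per-block footprint to keep a single write batch well under one flush.
	auto const max_batch = write_cache_bytes / (2 * (block_type_size + state_block_size + sideband_size));
	std::cout << max_batch << std::endl; // 92948 with these stand-in sizes
	return 0;
}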