Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rocksdb overhaul and clean up #4730

Merged
merged 9 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions nano/core_test/toml.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -239,8 +239,9 @@ TEST (toml, daemon_config_deserialize_defaults)
ASSERT_EQ (conf.node.lmdb_config.map_size, defaults.node.lmdb_config.map_size);

ASSERT_EQ (conf.node.rocksdb_config.enable, defaults.node.rocksdb_config.enable);
ASSERT_EQ (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_EQ (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
ASSERT_EQ (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
ASSERT_EQ (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);

ASSERT_EQ (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
ASSERT_EQ (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
Expand Down Expand Up @@ -573,8 +574,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)

[node.rocksdb]
enable = true
memory_multiplier = 3
io_threads = 99
read_cache = 99
write_cache = 99

[node.experimental]
secondary_work_peers = ["dev.org:998"]
Expand Down Expand Up @@ -743,8 +745,9 @@ TEST (toml, daemon_config_deserialize_no_defaults)

ASSERT_TRUE (conf.node.rocksdb_config.enable);
ASSERT_EQ (nano::rocksdb_config::using_rocksdb_in_tests (), defaults.node.rocksdb_config.enable);
ASSERT_NE (conf.node.rocksdb_config.memory_multiplier, defaults.node.rocksdb_config.memory_multiplier);
ASSERT_NE (conf.node.rocksdb_config.io_threads, defaults.node.rocksdb_config.io_threads);
ASSERT_NE (conf.node.rocksdb_config.read_cache, defaults.node.rocksdb_config.read_cache);
ASSERT_NE (conf.node.rocksdb_config.write_cache, defaults.node.rocksdb_config.write_cache);

ASSERT_NE (conf.node.optimistic_scheduler.enabled, defaults.node.optimistic_scheduler.enabled);
ASSERT_NE (conf.node.optimistic_scheduler.gap_threshold, defaults.node.optimistic_scheduler.gap_threshold);
Expand Down
17 changes: 13 additions & 4 deletions nano/lib/rocksdbconfig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,34 @@
// Serializes the RocksDB node configuration to TOML, with an inline
// description per key so generated config files are self-documenting.
// Returns any error accumulated on the tomlconfig.
nano::error nano::rocksdb_config::serialize_toml (nano::tomlconfig & toml) const
{
	toml.put ("enable", enable, "Whether to use the RocksDB backend for the ledger database.\ntype:bool");
	// NOTE(review): the obsolete "memory_multiplier" entry was removed in this
	// overhaul in favour of the explicit read_cache/write_cache sizes below,
	// so it is no longer serialized here.
	toml.put ("io_threads", io_threads, "Number of threads to use with the background compaction and flushing.\ntype:uint32");
	toml.put ("read_cache", read_cache, "Amount of megabytes per table allocated to read cache. Valid range is 1 - 1024. Default is 32.\nCarefully monitor memory usage if non-default values are used\ntype:long");
	toml.put ("write_cache", write_cache, "Total amount of megabytes allocated to write cache. Valid range is 1 - 256. Default is 64.\nCarefully monitor memory usage if non-default values are used\ntype:long");

	return toml.get_error ();
}

nano::error nano::rocksdb_config::deserialize_toml (nano::tomlconfig & toml)
{
toml.get_optional<bool> ("enable", enable);
toml.get_optional<uint8_t> ("memory_multiplier", memory_multiplier);
toml.get_optional<unsigned> ("io_threads", io_threads);
toml.get_optional<long> ("read_cache", read_cache);
toml.get_optional<long> ("write_cache", write_cache);

// Validate ranges
if (io_threads == 0)
{
toml.get_error ().set ("io_threads must be non-zero");
}
if (memory_multiplier < 1 || memory_multiplier > 3)

if (read_cache < 1 || read_cache > 1024)
{
toml.get_error ().set ("read_cache must be between 1 and 1024 MB");
}

if (write_cache < 1 || write_cache > 256)
{
toml.get_error ().set ("memory_multiplier must be either 1, 2 or 3");
toml.get_error ().set ("write_cache must be between 1 and 256 MB");
}

return toml.get_error ();
Expand Down
3 changes: 2 additions & 1 deletion nano/lib/rocksdbconfig.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ class rocksdb_config final
static bool using_rocksdb_in_tests ();

bool enable{ false };
uint8_t memory_multiplier{ 2 };
unsigned io_threads{ std::max (nano::hardware_concurrency () / 2, 1u) };
long read_cache{ 32 };
long write_cache{ 64 };
};
}
210 changes: 14 additions & 196 deletions nano/store/rocksdb/rocksdb.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
logger{ logger_a },
constants{ constants },
rocksdb_config{ rocksdb_config_a },
max_block_write_batch_num_m{ nano::narrow_cast<unsigned> (blocks_memtable_size_bytes () / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
max_block_write_batch_num_m{ nano::narrow_cast<unsigned> ((rocksdb_config_a.write_cache * 1024 * 1024) / (2 * (sizeof (nano::block_type) + nano::state_block::size + nano::block_sideband::size (nano::block_type::state)))) },
cf_name_table_map{ create_cf_name_table_map () }
{
boost::system::error_code error_mkdir, error_chmod;
Expand All @@ -80,7 +80,6 @@ nano::store::rocksdb::component::component (nano::logger & logger_a, std::filesy
debug_assert (path_a.filename () == "rocksdb");

generate_tombstone_map ();
small_table_factory.reset (::rocksdb::NewBlockBasedTableFactory (get_small_table_options ()));

// TODO: get_db_options () registers a listener for resetting tombstones, needs to check if it is a problem calling it more than once.
auto options = get_db_options ();
Expand Down Expand Up @@ -400,120 +399,16 @@ void nano::store::rocksdb::component::generate_tombstone_map ()
tombstone_map.emplace (std::piecewise_construct, std::forward_as_tuple (nano::tables::pending), std::forward_as_tuple (0, 25000));
}

// Baseline tuning shared by every column family: two memtables (one active,
// one flushing) and an LSM shape derived from the requested memtable size.
rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
	// (1 active, 1 inactive)
	constexpr auto memtable_count = 2;

	::rocksdb::ColumnFamilyOptions options;
	options.table_factory = table_factory_a;

	// Each write buffer is one memtable; keep exactly two in memory.
	options.write_buffer_size = memtable_size_bytes_a;
	options.max_write_buffer_number = memtable_count;
	// Although this should be the RocksDB default, leaving it unset was
	// observed to cause sequence-conflict checks.
	options.max_write_buffer_size_to_maintain = memtable_size_bytes_a * memtable_count;

	// LSM shape: L1 SST files match the memtable size, file size targets grow
	// 10x per level, and byte budgets grow 8x per level (e.g. if L1 is 512MB,
	// L2 is 4GB, L3 is 32GB, ...).
	options.target_file_size_base = memtable_size_bytes_a;
	options.target_file_size_multiplier = 10;
	options.max_bytes_for_level_multiplier = 8;

	// Files untouched for a day are compacted when there is no other
	// background work; this can increase write volume.
	options.ttl = 1 * 24 * 60 * 60;

	return options;
}

// Selects per-column-family options from the table name. Frequently written
// families get "active" tuning with a per-family block cache budget; tiny or
// rarely used families get "small" tuning with ~10KB memtables.
rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_cf_options (std::string const & cf_name_a) const
{
::rocksdb::ColumnFamilyOptions cf_options;
auto const memtable_size_bytes = base_memtable_size_bytes ();
// Block cache budget in bytes, scaled by the configured memory multiplier.
auto const block_cache_size_bytes = 1024ULL * 1024 * rocksdb_config.memory_multiplier * base_block_cache_size;
if (cf_name_a == "blocks")
{
// Largest table: 4x the block cache budget plus a dedicated memtable size.
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 4)));
cf_options = get_active_cf_options (table_factory, blocks_memtable_size_bytes ());
}
else if (cf_name_a == "confirmation_height")
{
// Entries will not be deleted in the normal case, so can make memtables a lot bigger
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes * 2);
}
else if (cf_name_a == "meta" || cf_name_a == "online_weight" || cf_name_a == "peers")
{
// Meta - It contains just version key
// Online weight - Periodically deleted
// Peers - Cleaned periodically, a lot of deletions. This is never read outside of initializing? Keep this small
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "cached_counts")
{
// Really small (keys are blocks tables, value is uint64_t)
cf_options = get_small_cf_options (small_table_factory);
}
else if (cf_name_a == "pending")
{
// Pending can have a lot of deletions too
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);

// Number of files in level 0 which triggers compaction. Size of L0 and L1 should be kept similar as this is the only compaction which is single threaded
cf_options.level0_file_num_compaction_trigger = 2;

// L1 size, compaction is triggered for L0 at this size (2 SST files in L1)
cf_options.max_bytes_for_level_base = memtable_size_bytes * 2;
}
else if (cf_name_a == "frontiers")
{
// Frontiers is only needed during bootstrap for legacy blocks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "accounts")
{
// Can have deletions from rollbacks
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "vote")
{
// No deletes it seems, only overwrites.
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "pruned")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == "final_votes")
{
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_active_table_options (block_cache_size_bytes * 2)));
cf_options = get_active_cf_options (table_factory, memtable_size_bytes);
}
else if (cf_name_a == ::rocksdb::kDefaultColumnFamilyName)
{
// Do nothing.
}
// NOTE(review): the branch below is an artifact of the diff view interleaving
// the removed `else { debug_assert (false); }` fallback with the merged
// simplified implementation (`if (cf_name_a != kDefaultColumnFamilyName)`);
// confirm against the merged file which form is current.
else
if (cf_name_a != ::rocksdb::kDefaultColumnFamilyName)
{
debug_assert (false);
std::shared_ptr<::rocksdb::TableFactory> table_factory (::rocksdb::NewBlockBasedTableFactory (get_table_options ()));
cf_options.table_factory = table_factory;
// Size of each memtable (write buffer for this column family)
cf_options.write_buffer_size = rocksdb_config.write_cache * 1024 * 1024;
}

return cf_options;
}

Expand Down Expand Up @@ -863,30 +758,11 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
db_options.create_if_missing = true;
db_options.create_missing_column_families = true;

// TODO: review if this should be changed due to the unchecked table removal.
// Enable whole key bloom filter in memtables for ones with memtable_prefix_bloom_size_ratio set (unchecked table currently).
// It can potentially reduce CPU usage for point-look-ups.
db_options.memtable_whole_key_filtering = true;

// Sets the compaction priority
db_options.compaction_pri = ::rocksdb::CompactionPri::kMinOverlappingRatio;

// Start aggressively flushing WAL files when they reach over 1GB
db_options.max_total_wal_size = 1 * 1024 * 1024 * 1024LL;

// Optimize RocksDB. This is the easiest way to get RocksDB to perform well
db_options.IncreaseParallelism (rocksdb_config.io_threads);
db_options.OptimizeLevelStyleCompaction ();

// Adds a separate write queue for memtable/WAL
db_options.enable_pipelined_write = true;

// Default is 16, setting to -1 allows faster startup times for SSDs by allowings more files to be read in parallel.
db_options.max_file_opening_threads = -1;

// The MANIFEST file contains a history of all file operations since the last time the DB was opened and is replayed during DB open.
// Default is 1GB, lowering this to avoid replaying for too long (100MB)
db_options.max_manifest_file_size = 100 * 1024 * 1024ULL;
// Set max number of threads
db_options.IncreaseParallelism (rocksdb_config.io_threads);

// Not compressing any SST files for compatibility reasons.
db_options.compression = ::rocksdb::kNoCompression;
Expand All @@ -899,75 +775,27 @@ rocksdb::Options nano::store::rocksdb::component::get_db_options ()
return db_options;
}

// Block-based table options shared by all column families in the overhauled
// scheme: data-block hash index for point lookups, storage format_version 5,
// and a single LRU read cache sized from the node configuration.
// NOTE(review): this span of the diff stacked the removed signature
// (get_active_table_options (std::size_t)) on top of the merged one and mixed
// removed format_version=4 / bloom-filter lines into the body; reconstructed
// here as the merged get_table_options () declared in rocksdb.hpp.
rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_table_options () const
{
	::rocksdb::BlockBasedTableOptions table_options;

	// Improve point lookup performance by using the data block hash index (uses about 5% more space).
	table_options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
	table_options.data_block_hash_table_util_ratio = 0.75;

	// Using storage format_version 5.
	// Version 5 offers improved read speed, caching and better compression (if enabled).
	// Any existing ledger data in version 4 will not be migrated. New data will be written in version 5.
	table_options.format_version = 5;

	// Block cache for reads, sized in megabytes from the node configuration.
	table_options.block_cache = ::rocksdb::NewLRUCache (rocksdb_config.read_cache * 1024 * 1024);

	return table_options;
}

// Table options for the tiny, rarely-read column families (meta, peers,
// online_weight, cached_counts): very small blocks plus a hash index for
// cheap point lookups.
rocksdb::BlockBasedTableOptions nano::store::rocksdb::component::get_small_table_options () const
{
	::rocksdb::BlockBasedTableOptions options;

	// Keep SST blocks tiny; these tables hold almost no data.
	options.block_size = 1024ULL;

	// Improve point lookup performance by using the data block hash index
	// (uses about 5% more space).
	options.data_block_index_type = ::rocksdb::BlockBasedTableOptions::DataBlockIndexType::kDataBlockBinaryAndHash;
	options.data_block_hash_table_util_ratio = 0.75;

	return options;
}

// Column-family tuning for tiny tables: ~10KB memtables and very eager
// compaction, since these families hold almost no data.
rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const
{
	constexpr auto small_memtable_bytes = 10000;

	auto options = get_common_cf_options (table_factory_a, small_memtable_bytes);

	// Compact as soon as a single L0 file exists; keep L0 and L1 the same size
	// because the L0 -> L1 compaction is the only single-threaded one.
	options.level0_file_num_compaction_trigger = 1;

	// L1 size, compaction is triggered for L0 at this size (1 SST file in L1).
	options.max_bytes_for_level_base = small_memtable_bytes;

	return options;
}

// Column-family tuning for the frequently written ("active") tables, layered
// on top of the common options.
::rocksdb::ColumnFamilyOptions nano::store::rocksdb::component::get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const
{
	auto options = get_common_cf_options (table_factory_a, memtable_size_bytes_a);

	// Trigger the (single-threaded) L0 -> L1 compaction at four L0 files, and
	// size L1 to match (4 SST files), keeping L0 and L1 similar in size.
	options.level0_file_num_compaction_trigger = 4;
	options.max_bytes_for_level_base = memtable_size_bytes_a * 4;

	// Derive per-level size targets dynamically from the size of the last level.
	options.level_compaction_dynamic_level_bytes = true;

	return options;
}

void nano::store::rocksdb::component::on_flush (::rocksdb::FlushJobInfo const & flush_job_info_a)
{
// Reset appropriate tombstone counters
Expand Down Expand Up @@ -1109,16 +937,6 @@ void nano::store::rocksdb::component::serialize_memory_stats (boost::property_tr
json.put ("block-cache-usage", val);
}

// Memtable size in bytes for the "blocks" column family.
// Currently just the base memtable size; kept as a separate accessor so the
// blocks table can be sized independently of the other families.
unsigned long long nano::store::rocksdb::component::blocks_memtable_size_bytes () const
{
return base_memtable_size_bytes ();
}

// Base memtable budget in bytes: base_memtable_size megabytes scaled by the
// configured memory multiplier (1 = low, 2 = medium, 3 = high).
unsigned long long nano::store::rocksdb::component::base_memtable_size_bytes () const
{
	constexpr auto bytes_per_megabyte = 1024ULL * 1024;
	return bytes_per_megabyte * rocksdb_config.memory_multiplier * base_memtable_size;
}

// This is a ratio of the blocks memtable size to keep total write transaction commit size down.
unsigned nano::store::rocksdb::component::max_block_write_batch_num () const
{
Expand Down
12 changes: 1 addition & 11 deletions nano/store/rocksdb/rocksdb.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ class component : public nano::store::component
::rocksdb::TransactionDB * transaction_db = nullptr;
std::unique_ptr<::rocksdb::DB> db;
std::vector<std::unique_ptr<::rocksdb::ColumnFamilyHandle>> handles;
std::shared_ptr<::rocksdb::TableFactory> small_table_factory;
std::unordered_map<nano::tables, nano::mutex> write_lock_mutexes;
nano::rocksdb_config rocksdb_config;
unsigned const max_block_write_batch_num_m;
Expand Down Expand Up @@ -155,11 +154,7 @@ class component : public nano::store::component

void construct_column_family_mutexes ();
::rocksdb::Options get_db_options ();
::rocksdb::ColumnFamilyOptions get_common_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_active_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a, unsigned long long memtable_size_bytes_a) const;
::rocksdb::ColumnFamilyOptions get_small_cf_options (std::shared_ptr<::rocksdb::TableFactory> const & table_factory_a) const;
::rocksdb::BlockBasedTableOptions get_active_table_options (std::size_t lru_size) const;
::rocksdb::BlockBasedTableOptions get_small_table_options () const;
::rocksdb::BlockBasedTableOptions get_table_options () const;
::rocksdb::ColumnFamilyOptions get_cf_options (std::string const & cf_name_a) const;

void on_flush (::rocksdb::FlushJobInfo const &);
Expand All @@ -169,11 +164,6 @@ class component : public nano::store::component
std::unordered_map<char const *, nano::tables> create_cf_name_table_map () const;

std::vector<::rocksdb::ColumnFamilyDescriptor> create_column_families ();
unsigned long long base_memtable_size_bytes () const;
unsigned long long blocks_memtable_size_bytes () const;

constexpr static int base_memtable_size = 16;
constexpr static int base_block_cache_size = 8;

friend class nano::rocksdb_block_store_tombstone_count_Test;
friend class rocksdb_block_store_upgrade_v21_v22_Test;
Expand Down
Loading