From be5c5bba3db9c11691840552234eb0af56c95029 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Tue, 5 Nov 2024 11:36:27 +0000 Subject: [PATCH 1/3] storage: define storage::log_manager_probe (cherry picked from commit 1cd18e8a8dd45f929cd716c989c7e3ed0bcb00ad) --- src/v/storage/BUILD | 15 +++++++++++++++ src/v/storage/CMakeLists.txt | 1 + src/v/storage/log_manager.cc | 9 ++++++++- src/v/storage/log_manager.h | 7 +++++++ src/v/storage/log_manager_probe.cc | 24 ++++++++++++++++++++++++ src/v/storage/log_manager_probe.h | 29 +++++++++++++++++++++++++++++ 6 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 src/v/storage/log_manager_probe.cc create mode 100644 src/v/storage/log_manager_probe.h diff --git a/src/v/storage/BUILD b/src/v/storage/BUILD index db1142ec23324..44f3c6ac8ac57 100644 --- a/src/v/storage/BUILD +++ b/src/v/storage/BUILD @@ -160,6 +160,7 @@ redpanda_cc_library( include_prefix = "storage", visibility = ["//visibility:public"], deps = [ + ":log_manager_probe", ":parser_utils", ":record_batch_builder", "//src/v/base", @@ -229,3 +230,17 @@ redpanda_cc_library( "@seastar", ], ) + +redpanda_cc_library( + name = "log_manager_probe", + srcs = [ + "log_manager_probe.cc", + ], + hdrs = [ + "log_manager_probe.h", + ], + include_prefix = "storage", + deps = [ + "//src/v/config", + ], +) diff --git a/src/v/storage/CMakeLists.txt b/src/v/storage/CMakeLists.txt index 1c04c5040592b..b19ccd5ba35db 100644 --- a/src/v/storage/CMakeLists.txt +++ b/src/v/storage/CMakeLists.txt @@ -5,6 +5,7 @@ v_cc_library( segment_reader.cc segment_deduplication_utils.cc log_manager.cc + log_manager_probe.cc disk_log_impl.cc disk_log_appender.cc parser.cc diff --git a/src/v/storage/log_manager.cc b/src/v/storage/log_manager.cc index f4d8135477122..336e539d4aa15 100644 --- a/src/v/storage/log_manager.cc +++ b/src/v/storage/log_manager.cc @@ -27,6 +27,7 @@ #include "storage/key_offset_map.h" #include "storage/kvstore.h" #include "storage/log.h" +#include "storage/log_manager_probe.h" #include "storage/logger.h" #include "storage/segment.h" #include "storage/segment_appender.h" @@ -143,7 +144,8 @@ log_manager::log_manager( , _feature_table(feature_table) , _jitter(_config.compaction_interval()) , _trigger_gc_jitter(0s, 5s) - , _batch_cache(_config.reclaim_opts) { + , _batch_cache(_config.reclaim_opts) + , _probe(std::make_unique()) { _config.compaction_interval.watch([this]() { _jitter = simple_time_jitter{ _config.compaction_interval()}; @@ -151,6 +153,8 @@ log_manager::log_manager( }); } +log_manager::~log_manager() = default; + ss::future<> log_manager::clean_close(ss::shared_ptr log) { auto clean_segment = co_await log->close(); @@ -171,6 +175,7 @@ ss::future<> log_manager::clean_close(ss::shared_ptr log) { } ss::future<> log_manager::start() { + _probe->setup_metrics(); if (unlikely(config::shard_local_cfg() .log_disable_housekeeping_for_tests.value())) { co_return; @@ -195,6 +200,8 @@ ss::future<> log_manager::stop() { co_await _compaction_hash_key_map->initialize(0); _compaction_hash_key_map.reset(); } + + _probe->clear_metrics(); } /** diff --git a/src/v/storage/log_manager.h b/src/v/storage/log_manager.h index ac066b26243d9..d344356111f86 100644 --- a/src/v/storage/log_manager.h +++ b/src/v/storage/log_manager.h @@ -47,6 +47,8 @@ namespace storage { +class log_manager_probe; + namespace testing_details { class log_manager_accessor; }; @@ -170,6 +172,7 @@ class log_manager { kvstore& kvstore, storage_resources&, ss::sharded&) noexcept; + ~log_manager(); ss::future> manage( ntp_config, @@ -293,6 +296,10 @@ class log_manager { // Hash key-map to use across multiple compactions to reuse reserved memory // rather than reallocating repeatedly. std::unique_ptr _compaction_hash_key_map; + + // Metrics. + std::unique_ptr _probe; + ss::gate _gate; ss::abort_source _abort_source; diff --git a/src/v/storage/log_manager_probe.cc b/src/v/storage/log_manager_probe.cc new file mode 100644 index 0000000000000..e5a85ce81aab0 --- /dev/null +++ b/src/v/storage/log_manager_probe.cc @@ -0,0 +1,24 @@ +// Copyright 2024 Redpanda Data, Inc. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +#include "storage/log_manager_probe.h" + +#include "config/configuration.h" + +namespace storage { + +void log_manager_probe::setup_metrics() { + if (config::shard_local_cfg().disable_metrics()) { + return; + } +} + +void log_manager_probe::clear_metrics() { _metrics.clear(); } + +} // namespace storage diff --git a/src/v/storage/log_manager_probe.h b/src/v/storage/log_manager_probe.h new file mode 100644 index 0000000000000..d849bdfa9fe66 --- /dev/null +++ b/src/v/storage/log_manager_probe.h @@ -0,0 +1,29 @@ +// Copyright 2024 Redpanda Data, Inc. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.md +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0 + +#pragma once + +namespace storage { + +/// Log manager per-shard storage probe. +class log_manager_probe { +public: + log_manager_probe() = default; + log_manager_probe(const log_manager_probe&) = delete; + log_manager_probe& operator=(const log_manager_probe&) = delete; + log_manager_probe(log_manager_probe&&) = delete; + log_manager_probe& operator=(log_manager_probe&&) = delete; + ~log_manager_probe() = default; + +public: + void setup_metrics(); + void clear_metrics(); +}; + +}; // namespace storage From fc11a3ba053c735b05d53c67ede9bd7a8d0c7e87 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Tue, 5 Nov 2024 11:39:53 +0000 Subject: [PATCH 2/3] storage: implement housekeeping metrics This will help monitor housekeeping liveness. (cherry picked from commit 1d7d27be58c86db0c6fe59c8f5f84c1de3a2aae5) --- src/v/storage/BUILD | 2 ++ src/v/storage/log_manager.cc | 2 ++ src/v/storage/log_manager_probe.cc | 22 ++++++++++++++++++++++ src/v/storage/log_manager_probe.h | 14 ++++++++++++++ tests/rptest/tests/full_disk_test.py | 23 ++++++++++++++++++++++- 5 files changed, 62 insertions(+), 1 deletion(-) diff --git a/src/v/storage/BUILD b/src/v/storage/BUILD index 44f3c6ac8ac57..53a10678be931 100644 --- a/src/v/storage/BUILD +++ b/src/v/storage/BUILD @@ -242,5 +242,7 @@ redpanda_cc_library( include_prefix = "storage", deps = [ "//src/v/config", + "//src/v/metrics", + "@seastar", ], ) diff --git a/src/v/storage/log_manager.cc b/src/v/storage/log_manager.cc index 336e539d4aa15..011bf1d8cb9a0 100644 --- a/src/v/storage/log_manager.cc +++ b/src/v/storage/log_manager.cc @@ -286,6 +286,7 @@ log_manager::housekeeping_scan(model::timestamp collection_threshold) { _abort_source, std::move(ntp_sanitizer_cfg), _compaction_hash_key_map.get())); + _probe->housekeeping_log_processed(); // bail out of compaction early in order to get back to gc if (_gc_triggered) { @@ -365,6 +366,7 @@ ss::future<> log_manager::housekeeping_loop() { // it is expected that callers set the flag whenever they want the // next round of housekeeping to priortize gc. _gc_triggered = false; + _probe->urgent_gc_run(); /* * build a schedule of partitions to gc ordered by amount of diff --git a/src/v/storage/log_manager_probe.cc b/src/v/storage/log_manager_probe.cc index e5a85ce81aab0..bf2449e0149c2 100644 --- a/src/v/storage/log_manager_probe.cc +++ b/src/v/storage/log_manager_probe.cc @@ -10,6 +10,9 @@ #include "storage/log_manager_probe.h" #include "config/configuration.h" +#include "metrics/prometheus_sanitize.h" + +#include namespace storage { @@ -17,6 +20,25 @@ void log_manager_probe::setup_metrics() { if (config::shard_local_cfg().disable_metrics()) { return; } + + namespace sm = ss::metrics; + + auto group_name = prometheus_sanitize::metrics_name("storage:manager"); + + _metrics.add_group( + group_name, + { + sm::make_counter( + "urgent_gc_runs", + [this] { return _urgent_gc_runs; }, + sm::description("Number of urgent GC runs")), + sm::make_counter( + "housekeeping_log_processed", + [this] { return _housekeeping_log_processed; }, + sm::description("Number of logs processed by housekeeping")), + }, + {}, + {}); } void log_manager_probe::clear_metrics() { _metrics.clear(); } diff --git a/src/v/storage/log_manager_probe.h b/src/v/storage/log_manager_probe.h index d849bdfa9fe66..017f5060f2bba 100644 --- a/src/v/storage/log_manager_probe.h +++ b/src/v/storage/log_manager_probe.h @@ -9,6 +9,10 @@ #pragma once +#include "metrics/metrics.h" + +#include + namespace storage { /// Log manager per-shard storage probe. @@ -24,6 +28,16 @@ class log_manager_probe { public: void setup_metrics(); void clear_metrics(); + +public: + void housekeeping_log_processed() { ++_housekeeping_log_processed; } + void urgent_gc_run() { ++_urgent_gc_runs; } + +private: + uint64_t _urgent_gc_runs = 0; + uint64_t _housekeeping_log_processed = 0; + + metrics::internal_metric_groups _metrics; }; }; // namespace storage diff --git a/tests/rptest/tests/full_disk_test.py b/tests/rptest/tests/full_disk_test.py index ecc113ed2a15c..37d3641e1cd95 100644 --- a/tests/rptest/tests/full_disk_test.py +++ b/tests/rptest/tests/full_disk_test.py @@ -24,7 +24,7 @@ from rptest.services.admin import Admin from rptest.services.cluster import cluster from rptest.services.kgo_verifier_services import KgoVerifierProducer, KgoVerifierSeqConsumer -from rptest.services.redpanda import LoggingConfig, RedpandaService, SISettings +from rptest.services.redpanda import LoggingConfig, MetricsEndpoint, RedpandaService, SISettings from rptest.tests.end_to_end import EndToEndTest from rptest.tests.redpanda_test import RedpandaTest from rptest.util import produce_total_bytes, search_logs_with_timeout @@ -315,6 +315,17 @@ def observed_data_size(pred): timeout_sec=30, backoff_sec=2) + assert self.redpanda.metric_sum( + metric_name= + "vectorized_storage_manager_housekeeping_log_processed_total", + metrics_endpoint=MetricsEndpoint.METRICS + ) == 0, "Housekeeping should not have run yet" + + assert self.redpanda.metric_sum( + metric_name="vectorized_storage_manager_urgent_gc_runs_total", + metrics_endpoint=MetricsEndpoint.METRICS + ) == 0, "GC should not have run yet" + # now trigger the disk space alert on the same node. unlike the 30 # second delay above, we should almost immediately observe the data # be reclaimed from disk. @@ -331,6 +342,16 @@ def observed_data_size(pred): timeout_sec=10, backoff_sec=2) + assert self.redpanda.metric_sum( + metric_name= + "vectorized_storage_manager_housekeeping_log_processed_total", + metrics_endpoint=MetricsEndpoint.METRICS + ) > 0, "Housekeeping should have run" + + assert self.redpanda.metric_sum( + metric_name="vectorized_storage_manager_urgent_gc_runs_total", + metrics_endpoint=MetricsEndpoint.METRICS) > 0, "GC should have run" + class LocalDiskReportTimeTest(RedpandaTest): topics = (TopicSpec(segment_bytes=2**20, From 66237ee7813926b2b0f11e8d5dcbe97d49afc174 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Tue, 5 Nov 2024 14:51:20 +0000 Subject: [PATCH 3/3] storage: add metric for logs under management (cherry picked from commit 08b6020d63f362726c5be4221e5a769bd2096610) --- src/v/storage/log_manager.cc | 11 +++++++++-- src/v/storage/log_manager.h | 2 ++ src/v/storage/log_manager_probe.cc | 4 ++++ src/v/storage/log_manager_probe.h | 2 ++ 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/src/v/storage/log_manager.cc b/src/v/storage/log_manager.cc index 011bf1d8cb9a0..f9277f487238f 100644 --- a/src/v/storage/log_manager.cc +++ b/src/v/storage/log_manager.cc @@ -584,7 +584,7 @@ ss::future> log_manager::do_manage( auto [it, success] = _logs.emplace( l->config().ntp(), std::make_unique(l)); _logs_list.push_back(*it->second); - _resources.update_partition_count(_logs.size()); + update_log_count(); vassert(success, "Could not keep track of:{} - concurrency issue", l); co_return l; } @@ -604,7 +604,7 @@ ss::future<> log_manager::remove(model::ntp ntp) { vlog(stlog.info, "Asked to remove: {}", ntp); auto g = _gate.hold(); auto handle = _logs.extract(ntp); - _resources.update_partition_count(_logs.size()); + update_log_count(); if (handle.empty()) { co_return; } @@ -887,4 +887,11 @@ gc_config log_manager::default_gc_config() const { return {collection_threshold, _config.retention_bytes()}; } +void log_manager::update_log_count() { + auto count = _logs.size(); + + _resources.update_partition_count(count); + _probe->set_log_count(count); +} + } // namespace storage diff --git a/src/v/storage/log_manager.h b/src/v/storage/log_manager.h index d344356111f86..2aef5c7d661dd 100644 --- a/src/v/storage/log_manager.h +++ b/src/v/storage/log_manager.h @@ -283,6 +283,8 @@ class log_manager { ss::future<> housekeeping_scan(model::timestamp); + void update_log_count(); + log_config _config; kvstore& _kvstore; storage_resources& _resources; diff --git a/src/v/storage/log_manager_probe.cc b/src/v/storage/log_manager_probe.cc index bf2449e0149c2..e0a87708314e0 100644 --- a/src/v/storage/log_manager_probe.cc +++ b/src/v/storage/log_manager_probe.cc @@ -28,6 +28,10 @@ void log_manager_probe::setup_metrics() { _metrics.add_group( group_name, { + sm::make_gauge( + "logs", + [this] { return _log_count; }, + sm::description("Number of logs managed")), sm::make_counter( "urgent_gc_runs", [this] { return _urgent_gc_runs; }, diff --git a/src/v/storage/log_manager_probe.h b/src/v/storage/log_manager_probe.h index 017f5060f2bba..dcaaf336b21f2 100644 --- a/src/v/storage/log_manager_probe.h +++ b/src/v/storage/log_manager_probe.h @@ -30,10 +30,12 @@ class log_manager_probe { void clear_metrics(); public: + void set_log_count(uint32_t log_count) { _log_count = log_count; } void housekeeping_log_processed() { ++_housekeeping_log_processed; } void urgent_gc_run() { ++_urgent_gc_runs; } private: + uint32_t _log_count = 0; uint64_t _urgent_gc_runs = 0; uint64_t _housekeeping_log_processed = 0;