From 68544d5bb33edb601cd3db45c1bf4b8a0e82c0d3 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Mon, 27 May 2024 11:39:58 +0300 Subject: [PATCH] repair_service: debug stop Seen the following unexplained assertion failure with pytest -s -v --scylla-version=local_tarball --tablets repair_additional_test.py::TestRepairAdditional::test_repair_option_pr_multi_dc ``` INFO 2024-05-27 11:18:05,081 [shard 0:main] init - Shutting down repair service INFO 2024-05-27 11:18:05,081 [shard 0:main] task_manager - Stopping module repair INFO 2024-05-27 11:18:05,081 [shard 0:main] task_manager - Unregistered module repair INFO 2024-05-27 11:18:05,081 [shard 1:main] task_manager - Stopping module repair INFO 2024-05-27 11:18:05,081 [shard 1:main] task_manager - Unregistered module repair scylla: repair/row_level.cc:3230: repair_service::~repair_service(): Assertion `_stopped' failed. Aborting on shard 0. Backtrace: /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x3f040c /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x41c7a1 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x3dbaf /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x8e883 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x3dafd /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x2687e /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x2679a /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x36186 0x26f2428 0x10fb373 0x10fc8b8 0x10fc809 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x456c6d /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x456bcf 0x10fc65b 0x10fc5bc 0x10808d0 0x1080800 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x3ff22f /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x4003b7 
/home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x3ff888 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x36dea8 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libseastar.so+0x36d0e2 0x101cefa 0x105a390 0x101bde7 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x27b89 /home/bhalevy/.ccm/scylla-repository/local_tarball/libreloc/libc.so.6+0x27c4a 0x101a764 ``` Decoded: ``` ~repair_service at ./repair/row_level.cc:3230 ~shared_ptr_count_for at ././seastar/include/seastar/core/shared_ptr.hh:491 (inlined by) ~shared_ptr_count_for at ././seastar/include/seastar/core/shared_ptr.hh:491 ~shared_ptr at ././seastar/include/seastar/core/shared_ptr.hh:569 (inlined by) seastar::shared_ptr::operator=(seastar::shared_ptr&&) at ././seastar/include/seastar/core/shared_ptr.hh:582 (inlined by) seastar::shared_ptr::operator=(decltype(nullptr)) at ././seastar/include/seastar/core/shared_ptr.hh:588 (inlined by) operator() at ././seastar/include/seastar/core/sharded.hh:727 (inlined by) seastar::future seastar::futurize >::invoke::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}&>(seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}&) at ././seastar/include/seastar/core/future.hh:2035 (inlined by) seastar::futurize::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}>::type>::type seastar::smp::submit_to::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}>(unsigned int, seastar::smp_submit_to_options, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned 
int)#1}::operator()(unsigned int) const::{lambda()#1}&&) at ././seastar/include/seastar/core/smp.hh:367 seastar::futurize::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}>::type>::type seastar::smp::submit_to::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}>(unsigned int, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}::operator()(unsigned int) const::{lambda()#1}&&) at ././seastar/include/seastar/core/smp.hh:394 (inlined by) operator() at ././seastar/include/seastar/core/sharded.hh:725 (inlined by) seastar::future std::__invoke_impl, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}&, unsigned int>(std::__invoke_other, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}&, unsigned int&&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/13/../../../../include/c++/13/bits/invoke.h:61 (inlined by) std::enable_if, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}&, unsigned int>, seastar::future >::type std::__invoke_r, seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}&, unsigned int>(seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}&, unsigned int&&) at /usr/bin/../lib/gcc/x86_64-redhat-linux/13/../../../../include/c++/13/bits/invoke.h:114 (inlined by) std::_Function_handler (unsigned int), seastar::sharded::stop()::{lambda(seastar::future)#1}::operator()(seastar::future) const::{lambda(unsigned int)#1}>::_M_invoke(std::_Any_data const&, unsigned int&&) at 
/usr/bin/../lib/gcc/x86_64-redhat-linux/13/../../../../include/c++/13/bits/std_function.h:290 ``` FWIW, gdb crashed when opening the coredump. This commit adds step-by-step logging to repair_service::stop() and turns any exception thrown from it into a fatal internal error, which will help catch the issue earlier when repair_service::stop() fails (and it must never fail). Signed-off-by: Benny Halevy (cherry picked from commit 38845754c4d99a41c9bed917c76579ef8567c89c) --- repair/row_level.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/repair/row_level.cc b/repair/row_level.cc index 777daa6442a4..4bc82c227732 100644 --- a/repair/row_level.cc +++ b/repair/row_level.cc @@ -3217,13 +3217,22 @@ future<> repair_service::start() { } future<> repair_service::stop() { + try { + rlogger.debug("Stopping repair task module"); co_await _repair_module->stop(); + rlogger.debug("Waiting on load_history_done"); co_await std::move(_load_history_done); + rlogger.debug("Uninitializing messaging service handlers"); co_await uninit_ms_handlers(); if (this_shard_id() == 0) { + rlogger.debug("Unregistering gossiper helper"); co_await _gossiper.local().unregister_(_gossip_helper); } _stopped = true; + rlogger.info("Stopped repair_service"); + } catch (...) { + on_fatal_internal_error(rlogger, format("Failed stopping repair_service: {}", std::current_exception())); + } } repair_service::~repair_service() {