Skip to content

Commit

Permalink
Merge pull request ClickHouse#72402 from Algunenano/keeper_fault
Browse files Browse the repository at this point in the history
Better error message on bad keeper snapshots
  • Loading branch information
Algunenano authored Dec 2, 2024
2 parents 62ed867 + 575e177 commit 0a5a0a2
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 12 deletions.
22 changes: 14 additions & 8 deletions src/Coordination/KeeperStateMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,13 +142,14 @@ void KeeperStateMachine<Storage>::init()
}
catch (...)
{
tryLogCurrentException(
LOG_FATAL(
log,
fmt::format(
"Aborting because of failure to load from latest snapshot with index {}. Problematic snapshot can be removed but it will "
"lead to data loss",
latest_log_index));
std::abort();
"Failure to load from latest snapshot with index {}: {}",
latest_log_index,
getCurrentExceptionMessage(true, true, false));
LOG_FATAL(
log, "Manual intervention is necessary for recovery. Problematic snapshot can be removed but it will lead to data loss");
abort();
}
}

Expand Down Expand Up @@ -427,8 +428,13 @@ bool KeeperStateMachine<Storage>::preprocess(const KeeperStorageBase::RequestFor
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("Failed to preprocess stored log at index {}, aborting to avoid inconsistent state", request_for_session.log_idx));
std::abort();
LOG_FATAL(
log,
"Failed to preprocess stored log at index {}: {}",
request_for_session.log_idx,
getCurrentExceptionMessage(true, true, false));
LOG_FATAL(log, "Aborting to avoid inconsistent state");
abort();
}

if (keeper_context->digestEnabled() && request_for_session.digest)
Expand Down
14 changes: 11 additions & 3 deletions src/Coordination/SnapshotableHashTable.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,11 @@
namespace DB
{

namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}

template<typename V>
struct ListNode
{
Expand Down Expand Up @@ -292,7 +297,8 @@ class SnapshotableHashTable
{
size_t hash_value = map.hash(key);
auto it = map.find(key, hash_value);
chassert(it != map.end());
if (it == map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key);

auto list_itr = it->getMapped();
uint64_t old_value_size = list_itr->value.sizeInBytes();
Expand Down Expand Up @@ -348,15 +354,17 @@ class SnapshotableHashTable
const V & getValue(StringRef key) const
{
auto it = map.find(key);
chassert(it);
if (it == map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key);
return it->getMapped()->value;
}

void clearOutdatedNodes()
{
for (auto & itr : snapshot_invalid_iters)
{
chassert(!itr->isActiveInMap());
if (itr->isActiveInMap())
throw Exception(ErrorCodes::LOGICAL_ERROR, "{} is not active in map", itr->key);
updateDataSize(ERASE, itr->key.size, 0, itr->value.sizeInBytes(), /*remove_old=*/true);
if (itr->getFreeKey())
arena.free(const_cast<char *>(itr->key.data), itr->key.size);
Expand Down
3 changes: 2 additions & 1 deletion tests/integration/test_keeper_snapshots/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,8 +191,9 @@ def snapshot_sort_key(snapshot_name):
]
)
node.start_clickhouse(start_wait_sec=120, expected_to_fail=True)
assert node.contains_in_log("Failure to load from latest snapshot with index")
assert node.contains_in_log(
"Aborting because of failure to load from latest snapshot with index"
"Manual intervention is necessary for recovery. Problematic snapshot can be removed but it will lead to data loss"
)

node.stop_clickhouse()
Expand Down

0 comments on commit 0a5a0a2

Please sign in to comment.