diff --git a/src/v/raft/consensus.cc b/src/v/raft/consensus.cc index dfbbcec98539f..0d3e59d8426ba 100644 --- a/src/v/raft/consensus.cc +++ b/src/v/raft/consensus.cc @@ -3955,7 +3955,7 @@ reply_result consensus::lightweight_heartbeat( target_node, _self, source_node); - return reply_result::failure; + return reply_result::group_unavailable; } /** @@ -4010,7 +4010,7 @@ ss::future consensus::full_heartbeat( target_vnode, _self, source_vnode); - reply.result = reply_result::failure; + reply.result = reply_result::group_unavailable; co_return reply; } /** diff --git a/src/v/raft/heartbeat_manager.cc b/src/v/raft/heartbeat_manager.cc index 34b6f77ee31e7..45b8d82bc6ada 100644 --- a/src/v/raft/heartbeat_manager.cc +++ b/src/v/raft/heartbeat_manager.cc @@ -340,6 +340,17 @@ void heartbeat_manager::process_reply( return; } auto& reply = r.value(); + + if (reply.source() != n) { + vlog( + raftlog.warn, + "got heartbeat reply from a different node id {} (expected {}), " + "ignoring", + reply.source(), + n); + return; + } + reply.for_each_lw_reply([this, n, target = reply.target(), &groups]( group_id group, reply_result result) { auto it = _consensus_groups.find(group); diff --git a/tests/rptest/tests/admin_uuid_operations_test.py b/tests/rptest/tests/admin_uuid_operations_test.py index 80261467b976e..49a9fb5785e9a 100644 --- a/tests/rptest/tests/admin_uuid_operations_test.py +++ b/tests/rptest/tests/admin_uuid_operations_test.py @@ -276,14 +276,12 @@ def test_force_uuid_override(self, mode): backoff_sec=2, err_msg=f"{to_stop.name} did not take the UUID override") - self.logger.debug(f"Wait for the cluster to become healthy...") + self.logger.debug(f"Decommission ghost node [{ghost_node_id}]...") + self._decommission(ghost_node_id) + self.logger.debug(f"...and wait for the cluster to become healthy.") self.wait_until_cluster_healthy(timeout_sec=30) - self.logger.debug( - f".. and decommission ghost node [{ghost_node_id}]...") - self._decommission(ghost_node_id) - self.logger.debug( "Check that all this state sticks across a rolling restart") @@ -373,14 +371,11 @@ def test_force_uuid_override_multinode(self, mode): auto_assign_node_id=True, ) - self.logger.debug("Wait for the cluster to become healthy...") + self.logger.debug(f"Decommission ghost node [{ghost_node_id}]...") + self._decommission(ghost_node_id) + self.logger.debug("...and wait for the cluster to become healthy.") controller_leader = self.wait_until_cluster_healthy(timeout_sec=30) assert controller_leader is not None, "Didn't elect a controller leader" assert controller_leader not in to_stop, f"Unexpected controller leader {controller_leader.account.hostname}" - - self.logger.debug( - f"...and decommission ghost node [{ghost_node_id}]...") - - self._decommission(ghost_node_id)