Skip to content

Commit

Permalink
fix(nemesis): ignore networkd coredumps during BlockNetwork nemesis
Browse files Browse the repository at this point in the history
The change reduces the severity of coredump events during the disrupt_network_block
disruption if the coredump was triggered by an error in the networkd service of
version 255 or lower (the root cause is described in #9135).

Fixes: #9135
(cherry picked from commit bc96c1c)

# Conflicts:
#	sdcm/nemesis.py
  • Loading branch information
dimakr authored and mergify[bot] committed Dec 10, 2024
1 parent a36509a commit ba74d42
Show file tree
Hide file tree
Showing 15 changed files with 568 additions and 58 deletions.
43 changes: 42 additions & 1 deletion sdcm/coredump.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class CoreDumpInfo:
download_url: str = ''
command_line: str = ''
executable: str = ''
executable_version: str = ''
process_retry: int = 0

def publish_event(self):
Expand All @@ -52,7 +53,9 @@ def publish_event(self):
corefile_url=self.download_url,
backtrace=self.coredump_info,
download_instructions=self.download_instructions,
source_timestamp=self.source_timestamp
source_timestamp=self.source_timestamp,
executable=self.executable,
executable_version=self.executable_version
).publish()

def __str__(self):
Expand All @@ -70,6 +73,7 @@ def update(self,
download_url: str = None,
command_line: str = None,
executable: str = None,
executable_version: str = None,
process_retry: int = None):
for attr_name, attr_value in {
'node': node,
Expand All @@ -80,6 +84,7 @@ def update(self,
'download_url': download_url,
'command_line': command_line,
'executable': executable,
'executable_version': executable_version,
'process_retry': process_retry,
}.items():
if attr_value is not None:
Expand Down Expand Up @@ -198,6 +203,7 @@ def extract_info_from_core_pids(
break
if found:
continue
self.update_new_coredump_with_exec_information(new_core_info)
self.publish_event(new_core_info)
output.append(new_core_info)
return output
Expand Down Expand Up @@ -299,6 +305,41 @@ def n_coredumps(self) -> int:
def update_coredump_info_with_more_information(self, core_info: CoreDumpInfo):
    """Attach additional details to *core_info*.

    This implementation does nothing.
    NOTE(review): presumably a hook overridden by subclasses -- confirm.

    :param core_info: coredump record that would be enriched.
    """
    pass

def _get_core_by_pid(self, pid: str) -> Optional[dict]:
    """Return the first ``coredumpctl`` entry listed for *pid*.

    Runs ``coredumpctl list <pid> -q --json=short`` on the node through sudo
    and parses the JSON it prints.

    :param pid: PID of the crashed process, as passed to ``coredumpctl list``.
    :return: the first element of the parsed JSON list, or ``None`` when the
        command fails, its output is not valid JSON, or it lists no entries.
    """
    result = self.node.remoter.sudo(f"coredumpctl list {pid} -q --json=short", verbose=False, ignore_status=True)
    if not result.ok:
        return None

    try:
        cores_info = json.loads(result.stdout)
    except json.JSONDecodeError:
        self.log.warning("couldn't parse:\n %s", result.stdout)
        return None

    # An empty list (or a non-list payload) would make the [0] lookup raise,
    # so treat it the same way as "nothing found".
    if not isinstance(cores_info, list) or not cores_info:
        self.log.warning("no coredump entries listed for pid %s", pid)
        return None
    return cores_info[0]

def _get_executable_version(self, executable: str) -> Optional[str]:
    """Return the version of the package that owns *executable* on the node.

    The owning package is looked up with ``rpm -qf`` on RHEL-like distros or
    ``dpkg -S`` on Ubuntu/Debian; its version is then queried and reduced by
    ``self._extract_version``.

    :param executable: path of the executable on the node.
    :return: whatever ``self._extract_version`` returns for the package
        version, or ``None`` when no owning package could be determined.
    :raises RuntimeError: if the node distro is neither RHEL-like, Ubuntu
        nor Debian.
    """
    remoter = self.node.remoter
    distro = self.node.distro

    if distro.is_rhel_like:
        owner = remoter.run(f"rpm -qf {executable}", ignore_status=True)
        pkg = owner.stdout.strip()
        version_query = "rpm -q --queryformat '%{VERSION}' "
    elif distro.is_ubuntu or distro.is_debian:
        owner = remoter.sudo(f"dpkg -S {executable}", ignore_status=True)
        pkg = owner.stdout.split(':')[0].strip()
        version_query = "dpkg-query --showformat='${Version}' --show "
    else:
        raise RuntimeError("Distro is not supported")

    # Without a package name the version query must not run: `dpkg-query
    # --show` with no pattern lists every installed package, which would
    # yield the version of an unrelated package.
    if not owner.ok or not pkg:
        return None

    release_version = remoter.run(f"{version_query}{pkg}", ignore_status=True).stdout.strip()
    return self._extract_version(release_version)

def update_new_coredump_with_exec_information(self, core_info: "CoreDumpInfo") -> None:
    """Fill *core_info* with the crashed executable's path and version.

    The executable path is the ``exe`` field of the entry returned by
    ``self._get_core_by_pid`` for ``core_info.pid``; the version comes from
    ``self._get_executable_version``. When no entry (or no ``exe`` field) is
    available, *core_info* is left untouched so the caller can still go on
    with the coredump.

    :param core_info: coredump record to update in place.
    """
    core = self._get_core_by_pid(core_info.pid)
    executable = core.get('exe') if core else None
    if not executable:
        # _get_core_by_pid returns None on failure; subscripting it would raise.
        self.log.warning("couldn't get executable information for coredump with pid %s", core_info.pid)
        return
    core_info.update(executable=executable, executable_version=self._get_executable_version(executable))

@staticmethod
def _extract_version(release_version: str) -> Optional[str]:
    """Extract the leading ``X.Y`` or ``X.Y.Z`` part of a package version.

    A Debian-style ``epoch:`` prefix (e.g. ``2:9.1.0016-1``) is skipped so
    that packages carrying an epoch still yield their upstream version.

    :param release_version: version string as reported by the package manager.
    :return: the dotted numeric version, or ``None`` if the string does not
        start with one.
    """
    match = re.match(r"(?:\d+:)?(\d+\.\d+(?:\.\d+)?)", release_version)
    return match.group(1) if match else None


class CoredumpExportSystemdThread(CoredumpThreadBase):
"""
Expand Down
29 changes: 22 additions & 7 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
from sdcm.sct_events.health import DataValidatorEvent
from sdcm.sct_events.loaders import CassandraStressLogEvent, ScyllaBenchEvent
from sdcm.sct_events.nemesis import DisruptionEvent
from sdcm.sct_events.system import InfoEvent
from sdcm.sct_events.system import InfoEvent, CoreDumpEvent
from sdcm.sla.sla_tests import SlaTests
from sdcm.utils.argus import get_argus_client
from sdcm.utils.aws_kms import AwsKms
Expand Down Expand Up @@ -132,7 +132,12 @@
from sdcm.utils.sstable.sstable_utils import SstableUtils
from sdcm.utils.tablets.common import wait_for_tablets_balanced
from sdcm.utils.toppartition_util import NewApiTopPartitionCmd, OldApiTopPartitionCmd
from sdcm.utils.version_utils import (
MethodVersionNotFound, scylla_versions, ComparableScyllaVersion, get_systemd_version)
from sdcm.utils.raft import Group0MembersNotConsistentWithTokenRingMembersException, TopologyOperations
from sdcm.utils.raft.common import NodeBootstrapAbortManager
from sdcm.utils.issues import SkipPerIssues
Expand Down Expand Up @@ -3495,16 +3500,26 @@ def disrupt_network_block(self):
if not self.target_node.install_traffic_control():
raise UnsupportedNemesis("Traffic control package not installed on system")

systemd_version = get_systemd_version(
self.target_node.remoter.run("systemctl --version", ignore_status=True).stdout)
if systemd_version < 256:
context_manager = EventsSeverityChangerFilter(
new_severity=Severity.WARNING, event_class=CoreDumpEvent, regex=r".*executable=.*networkd.*",
extra_time_to_expiration=60)
else:
context_manager = contextlib.nullcontext()

selected_option = "--loss 100%"
wait_time = random.choice(list_of_timeout_options)
self.log.debug("BlockNetwork: [%s] for %dsec", selected_option, wait_time)
self.target_node.traffic_control(None)
try:
self.target_node.traffic_control(selected_option)
time.sleep(wait_time)
finally:
with context_manager:
self.target_node.traffic_control(None)
self.cluster.wait_all_nodes_un()
try:
self.target_node.traffic_control(selected_option)
time.sleep(wait_time)
finally:
self.target_node.traffic_control(None)
self.cluster.wait_all_nodes_un()

def disrupt_remove_node_then_add_node(self): # pylint: disable=too-many-branches
"""
Expand Down
11 changes: 10 additions & 1 deletion sdcm/sct_events/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,18 @@ def __init__(self,
corefile_url: str,
backtrace: str,
download_instructions: str,
source_timestamp: Optional[float] = None):
source_timestamp: Optional[float] = None,
executable: Optional[str] = None,
executable_version: Optional[str] = None):

super().__init__(severity=Severity.ERROR)

self.node = str(node)
self.corefile_url = corefile_url
self.backtrace = backtrace
self.download_instructions = download_instructions
self.executable = executable
self.executable_version = executable_version
if source_timestamp is not None:
self.source_timestamp = source_timestamp

Expand All @@ -245,6 +249,11 @@ def msgfmt(self) -> str:
fmt += "Info about modules can be found in SCT logs by search for 'Coredump Modules info'\n"
if self.download_instructions:
fmt += "download_instructions:\n{0.download_instructions}\n"
if self.executable:
fmt += "executable={0.executable}"
if self.executable_version:
fmt += " executable_version={0.executable_version}"
fmt += "\n"

return fmt

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,5 +234,59 @@
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 5711 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661621,\"pid\":609,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/scylla\",\"size\":776650}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 41537 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661631,\"pid\":709,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/bsh\",\"size\":776660}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/scylla" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "scylla-server: /usr/bin/scylla\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/bsh" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "bash: /usr/bin/bsh\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show scylla-server" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "6.3.0~dev-0.20241208.f744007e1365-1\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show bash" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "5.2.21-2ubuntu4\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"download_instructions": "failed to upload core",
"download_url": "",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -20,7 +21,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
}
],
"in_progress": [],
Expand All @@ -34,7 +36,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -45,7 +48,8 @@
"download_instructions": "failed to upload core",
"download_url": "",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
}
],
"uploaded": [
Expand All @@ -58,7 +62,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,59 @@
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 5711 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661621,\"pid\":609,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/scylla\",\"size\":776650}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 41537 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661631,\"pid\":709,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/bsh\",\"size\":776660}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/scylla" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "scylla-server: /usr/bin/scylla\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/bsh" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "bash: /usr/bin/bsh\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show scylla-server" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "6.3.0~dev-0.20241208.f744007e1365-1\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show bash" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "5.2.21-2ubuntu4\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
]
}
Loading

0 comments on commit ba74d42

Please sign in to comment.