Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 6.2] fix(nemesis): ignore networkd coredumps during BlockNetwork nemesis #9528

Merged
merged 1 commit into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 42 additions & 1 deletion sdcm/coredump.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ class CoreDumpInfo:
download_url: str = ''
command_line: str = ''
executable: str = ''
executable_version: str = ''
process_retry: int = 0

def publish_event(self):
Expand All @@ -52,7 +53,9 @@ def publish_event(self):
corefile_url=self.download_url,
backtrace=self.coredump_info,
download_instructions=self.download_instructions,
source_timestamp=self.source_timestamp
source_timestamp=self.source_timestamp,
executable=self.executable,
executable_version=self.executable_version
).publish()

def __str__(self):
Expand All @@ -70,6 +73,7 @@ def update(self,
download_url: str = None,
command_line: str = None,
executable: str = None,
executable_version: str = None,
process_retry: int = None):
for attr_name, attr_value in {
'node': node,
Expand All @@ -80,6 +84,7 @@ def update(self,
'download_url': download_url,
'command_line': command_line,
'executable': executable,
'executable_version': executable_version,
'process_retry': process_retry,
}.items():
if attr_value is not None:
Expand Down Expand Up @@ -198,6 +203,7 @@ def extract_info_from_core_pids(
break
if found:
continue
self.update_new_coredump_with_exec_information(new_core_info)
self.publish_event(new_core_info)
output.append(new_core_info)
return output
Expand Down Expand Up @@ -299,6 +305,41 @@ def n_coredumps(self) -> int:
def update_coredump_info_with_more_information(self, core_info: CoreDumpInfo):
    """Enrich ``core_info`` with additional details; this implementation does nothing.

    NOTE(review): presumably an extension point that subclasses override --
    confirm against the rest of the class hierarchy.

    :param core_info: the coredump record that would be updated (unused here).
    :return: ``None``.
    """
    return None

def _get_core_by_pid(self, pid: str) -> Optional[dict]:
    """Return the ``coredumpctl`` record for ``pid``, or ``None`` if unavailable.

    Runs ``coredumpctl list <pid> -q --json=short`` through ``remoter.sudo``
    on the node and returns the first entry of the JSON array it prints.

    :param pid: PID of the crashed process, passed to ``coredumpctl list``.
    :return: the first listed core as a dict, or ``None`` when the command
        fails, its output is not valid JSON, or it contains no usable entry.
    """
    result = self.node.remoter.sudo(f"coredumpctl list {pid} -q --json=short", verbose=False, ignore_status=True)
    if not result.ok:
        return None

    try:
        cores_info = json.loads(result.stdout)
    except json.JSONDecodeError:
        self.log.warning("couldn't parse:\n %s", result.stdout)
        return None

    # An empty array or a non-list payload has no first element; report
    # "not found" instead of raising IndexError/TypeError/KeyError.
    if not isinstance(cores_info, list) or not cores_info:
        return None
    first_core = cores_info[0]
    # Honour the declared return type: callers index the result by key.
    return first_core if isinstance(first_core, dict) else None

def _get_executable_version(self, executable: str) -> Optional[str]:
    """Return the version of the package that owns ``executable`` on the node.

    The owning package is looked up with ``rpm -qf`` on RHEL-like distros and
    with ``dpkg -S`` on Ubuntu/Debian; its version is then queried with
    ``rpm -q`` / ``dpkg-query`` and reduced by ``self._extract_version``.

    :param executable: path of the executable on the node.
    :return: the extracted version string, or ``None`` when the owning
        package cannot be determined or no version can be extracted.
    :raises RuntimeError: if the node's distro is neither RHEL-like, Ubuntu
        nor Debian.
    """
    remoter = self.node.remoter
    distro = self.node.distro
    if distro.is_rhel_like:
        owner = remoter.run(f"rpm -qf {executable}", ignore_status=True)
        pkg = owner.stdout.strip()
        version_cmd = f"rpm -q --queryformat '%{{VERSION}}' {pkg}"
    elif distro.is_ubuntu or distro.is_debian:
        owner = remoter.sudo(f"dpkg -S {executable}", ignore_status=True)
        # dpkg -S prints "<package>[:arch]: <path>"; keep only the package name.
        pkg = owner.stdout.split(':')[0].strip()
        version_cmd = f"dpkg-query --showformat='${{Version}}' --show {pkg}"
    else:
        raise RuntimeError("Distro is not supported")

    # If the lookup failed there is no package to query. Running the version
    # command anyway would be wrong: `dpkg-query --show` without a package
    # name lists every installed package, yielding an unrelated version.
    if not owner.ok or not pkg:
        return None

    release_version = remoter.run(version_cmd, ignore_status=True).stdout.strip()
    return self._extract_version(release_version)

def update_new_coredump_with_exec_information(self, core_info: CoreDumpInfo) -> None:
    """Fill ``executable`` and ``executable_version`` on ``core_info``.

    Looks up the core for ``core_info.pid`` via ``self._get_core_by_pid`` and,
    when it reports an ``exe`` path, stores that path together with the
    version returned by ``self._get_executable_version``.

    :param core_info: coredump record to update in place; left untouched when
        no core record or no executable path is available.
    """
    core = self._get_core_by_pid(core_info.pid)
    # _get_core_by_pid returns None when coredumpctl fails or prints unparsable
    # output; subscripting it would raise and abort the caller's processing.
    executable = core.get('exe') if core else None
    if not executable:
        return
    core_info.update(executable=executable, executable_version=self._get_executable_version(executable))

@staticmethod
def _extract_version(release_version: str) -> Optional[str]:
    """Extract the leading ``X.Y`` or ``X.Y.Z`` version from a package version.

    A Debian-style epoch prefix (``<digits>:``) is skipped, so
    ``"2:8.2.3995-1ubuntu2"`` yields ``"8.2.3995"``.

    :param release_version: full package version string, e.g.
        ``"6.3.0~dev-0.20241208.f744007e1365-1"``.
    :return: the numeric ``major.minor[.patch]`` prefix, or ``None`` when the
        string does not start with one.
    """
    # re.match already anchors at the start of the string, so no "^" is needed.
    match = re.match(r"(?:\d+:)?(\d+\.\d+(?:\.\d+)?)", release_version)
    return match.group(1) if match else None


class CoredumpExportSystemdThread(CoredumpThreadBase):
"""
Expand Down
26 changes: 18 additions & 8 deletions sdcm/nemesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
from sdcm.sct_events.health import DataValidatorEvent
from sdcm.sct_events.loaders import CassandraStressLogEvent, ScyllaBenchEvent
from sdcm.sct_events.nemesis import DisruptionEvent
from sdcm.sct_events.system import InfoEvent
from sdcm.sct_events.system import InfoEvent, CoreDumpEvent
from sdcm.sla.sla_tests import SlaTests
from sdcm.utils.argus import get_argus_client
from sdcm.utils.aws_kms import AwsKms
Expand Down Expand Up @@ -132,7 +132,7 @@
from sdcm.utils.sstable.sstable_utils import SstableUtils
from sdcm.utils.tablets.common import wait_for_tablets_balanced
from sdcm.utils.toppartition_util import NewApiTopPartitionCmd, OldApiTopPartitionCmd
from sdcm.utils.version_utils import MethodVersionNotFound, scylla_versions
from sdcm.utils.version_utils import MethodVersionNotFound, scylla_versions, get_systemd_version
from sdcm.utils.raft import Group0MembersNotConsistentWithTokenRingMembersException, TopologyOperations
from sdcm.utils.raft.common import NodeBootstrapAbortManager
from sdcm.utils.issues import SkipPerIssues
Expand Down Expand Up @@ -3572,16 +3572,26 @@ def disrupt_network_block(self):
if not self.target_node.install_traffic_control():
raise UnsupportedNemesis("Traffic control package not installed on system")

systemd_version = get_systemd_version(
self.target_node.remoter.run("systemctl --version", ignore_status=True).stdout)
if systemd_version < 256:
context_manager = EventsSeverityChangerFilter(
new_severity=Severity.WARNING, event_class=CoreDumpEvent, regex=r".*executable=.*networkd.*",
extra_time_to_expiration=60)
else:
context_manager = contextlib.nullcontext()

selected_option = "--loss 100%"
wait_time = random.choice(list_of_timeout_options)
self.log.debug("BlockNetwork: [%s] for %dsec", selected_option, wait_time)
self.target_node.traffic_control(None)
try:
self.target_node.traffic_control(selected_option)
time.sleep(wait_time)
finally:
with context_manager:
self.target_node.traffic_control(None)
self.cluster.wait_all_nodes_un()
try:
self.target_node.traffic_control(selected_option)
time.sleep(wait_time)
finally:
self.target_node.traffic_control(None)
self.cluster.wait_all_nodes_un()

@target_data_nodes
def disrupt_remove_node_then_add_node(self): # pylint: disable=too-many-branches
Expand Down
11 changes: 10 additions & 1 deletion sdcm/sct_events/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,18 @@ def __init__(self,
corefile_url: str,
backtrace: str,
download_instructions: str,
source_timestamp: Optional[float] = None):
source_timestamp: Optional[float] = None,
executable: Optional[str] = None,
executable_version: Optional[str] = None):

super().__init__(severity=Severity.ERROR)

self.node = str(node)
self.corefile_url = corefile_url
self.backtrace = backtrace
self.download_instructions = download_instructions
self.executable = executable
self.executable_version = executable_version
if source_timestamp is not None:
self.source_timestamp = source_timestamp

Expand All @@ -245,6 +249,11 @@ def msgfmt(self) -> str:
fmt += "Info about modules can be found in SCT logs by search for 'Coredump Modules info'\n"
if self.download_instructions:
fmt += "download_instructions:\n{0.download_instructions}\n"
if self.executable:
fmt += "executable={0.executable}"
if self.executable_version:
fmt += " executable_version={0.executable_version}"
fmt += "\n"

return fmt

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,5 +234,59 @@
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 5711 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661621,\"pid\":609,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/scylla\",\"size\":776650}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 41537 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661631,\"pid\":709,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/bsh\",\"size\":776660}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/scylla" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "scylla-server: /usr/bin/scylla\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/bsh" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "bash: /usr/bin/bsh\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show scylla-server" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "6.3.0~dev-0.20241208.f744007e1365-1\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show bash" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "5.2.21-2ubuntu4\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"download_instructions": "failed to upload core",
"download_url": "",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -20,7 +21,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
}
],
"in_progress": [],
Expand All @@ -34,7 +36,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -45,7 +48,8 @@
"download_instructions": "failed to upload core",
"download_url": "",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
}
],
"uploaded": [
Expand All @@ -58,7 +62,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -241,5 +241,59 @@
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 5711 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661621,\"pid\":609,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/scylla\",\"size\":776650}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo coredumpctl list 41537 -q --json=short" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "[{\"time\":1733783727661631,\"pid\":709,\"uid\":998,\"gid\":998,\"sig\":11,\"corefile\":\"present\",\"exe\":\"/usr/bin/bsh\",\"size\":776660}]\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/scylla" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "scylla-server: /usr/bin/scylla\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"sudo dpkg -S /usr/bin/bsh" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "bash: /usr/bin/bsh\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show scylla-server" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "6.3.0~dev-0.20241208.f744007e1365-1\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
],
"dpkg-query --showformat='${Version}' --show bash" : [
{
"__instance__": "fabric.runners.Result",
"stdout": "5.2.21-2ubuntu4\n",
"stderr": "",
"exited": 0,
"exit_status": 0
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz .\ngunzip 45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -20,7 +21,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
}
],
"in_progress": [],
Expand All @@ -34,7 +36,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -45,7 +48,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz .\ngunzip 45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
}
],
"uploaded": [
Expand All @@ -58,7 +62,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz .\ngunzip ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/ac7d8023a369-41537-0-0-11-1600150672.core/ac7d8023a369-41537-0-0-11-1600150672.core.gz",
"command_line": "/bin/bash /scylla-housekeeping-service.sh",
"executable": ""
"executable": "/usr/bin/bsh",
"executable_version": "5.2.21"
},
{
"__instance__": "sdcm.coredump.CoreDumpInfo",
Expand All @@ -69,7 +74,8 @@
"download_instructions": "gsutil cp gs://upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz .\ngunzip 45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"download_url": "https://storage.cloud.google.com/upload.scylladb.com/45d8a24d50d3-5711-0-0-6-1600105104.core/45d8a24d50d3-5711-0-0-6-1600105104.core.gz",
"command_line": "/usr/bin/scylla --log-to-syslog 0 --log-to-stdout 1 --default-log-level info --",
"executable": ""
"executable": "/usr/bin/scylla",
"executable_version": "6.3.0"
}
]
}
Loading