From 706a3b7f0d39c1c38b51f2f76c3a598523dee069 Mon Sep 17 00:00:00 2001
From: r-vasquez
Date: Fri, 10 Jan 2025 12:28:17 -0800
Subject: [PATCH 1/2] rpk: save startup_log to debug bundle if present

If not we just warn that we skipped the collection
but the bundle will still be completed
---
Reviewer notes (commentary only, not part of the commit message):

  * saveStartupLog(ps, bp.y) is appended to the step lists of both
    executeK8SBundle and executeBundle.
  * The step looks for filepath.Join(y.Redpanda.Directory, "startup_log")
    on ps.fs and, when found, writes its content into the zip under the
    name "startup_log".
  * It returns an error when the data directory is empty, when the
    existence check or the read fails, when writeFileToZip fails, and
    also when the file is simply absent ("skipping startup_log
    collection: ...").
  * NOTE(review): per the commit message the "skipping" error is expected
    to surface only as a warning; the code that consumes step errors is
    not part of this patch, so confirm against the step runner.

 .../pkg/cli/debug/bundle/bundle_k8s_linux.go  |  1 +
 .../rpk/pkg/cli/debug/bundle/bundle_linux.go  | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
index b67d8a477a50c..cfb1aa410d230 100644
--- a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
+++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
@@ -79,6 +79,7 @@ func executeK8SBundle(ctx context.Context, bp bundleParams) error {
 		saveMountedFilesystems(ps),
 		saveNTPDrift(ps),
 		saveResourceUsageData(ps, bp.y),
+		saveStartupLog(ps, bp.y),
 		saveSlabInfo(ps),
 		saveUname(ctx, ps),
 	}
diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
index d108ede6e33f2..2ae0986683b58 100644
--- a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
+++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
@@ -152,6 +152,7 @@ func executeBundle(ctx context.Context, bp bundleParams) error {
 		saveResourceUsageData(ps, bp.y),
 		saveSingleAdminAPICalls(ctx, ps, bp.fs, bp.p, addrs, bp.cpuProfilerWait),
 		saveMetricsAPICalls(ctx, ps, bp.fs, bp.p, addrs, bp.metricsInterval, bp.metricsSampleCount),
+		saveStartupLog(ps, bp.y),
 		saveSlabInfo(ps),
 		saveSocketData(ctx, ps),
 		saveSysctl(ctx, ps),
@@ -1021,6 +1022,31 @@ func saveControllerLogDir(ps *stepParams, y *config.RedpandaYaml, logLimitBytes
 	}
 }
 
+func saveStartupLog(ps *stepParams, y *config.RedpandaYaml) step {
+	return func() error {
+		if y.Redpanda.Directory == "" {
+			return fmt.Errorf("failed to save startup_log: 'redpanda.data_directory' is empty on the provided configuration file")
+		}
+		path := filepath.Join(y.Redpanda.Directory, "startup_log")
+		exists, err := afero.Exists(ps.fs, path)
+		if err != nil {
+			return fmt.Errorf("failed to save startup_log: unable to check existence of startup_log: %v", err)
+		}
+		if !exists {
+			return fmt.Errorf("skipping startup_log collection: unable to find file %q", path)
+		}
+		content, err := afero.ReadFile(ps.fs, path)
+		if err != nil {
+			return fmt.Errorf("failed to save startup_log: unable to read startup_log: %v", err)
+		}
+		err = writeFileToZip(ps, "startup_log", content)
+		if err != nil {
+			return fmt.Errorf("failed to save startup_log: %v", err)
+		}
+		return nil
+	}
+}
+
 func walkDir(root string, files map[string]*fileInfo) error {
 	return filepath.WalkDir(
 		root,

From 4d54c8ad42f6c9e2def3e1fc6807d58e15fc0f06 Mon Sep 17 00:00:00 2001
From: r-vasquez
Date: Fri, 10 Jan 2025 16:18:17 -0800
Subject: [PATCH 2/2] rpk: collect crash_reports in debug bundle

Same as startup_log, we will only collect the directory if it's present
---
Reviewer notes (commentary only, not part of the commit message):

  * The bundle.go hunk below only extends the help text: it adds a
    "Crash information" entry ahead of the BARE-METAL section.

 src/go/rpk/pkg/cli/debug/bundle/bundle.go     |  3 +++
 .../pkg/cli/debug/bundle/bundle_k8s_linux.go  |  1 +
 .../rpk/pkg/cli/debug/bundle/bundle_linux.go  | 22 +++++++++++++++++++
 tests/rptest/tests/rpk_debug_bundle_test.py   |  7 +++++++
 4 files changed, 33 insertions(+)

diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle.go b/src/go/rpk/pkg/cli/debug/bundle/bundle.go
index 5338f1ca7f66c..9ab75d4704f17 100644
--- a/src/go/rpk/pkg/cli/debug/bundle/bundle.go
+++ b/src/go/rpk/pkg/cli/debug/bundle/bundle.go
@@ -249,6 +249,9 @@ COMMON FILES
  - Broker metrics: The broker's Prometheus metrics, fetched through its
    admin API (/metrics and /public_metrics).
 
+ - Crash information: Both startup_log and crash_reports will be collected if
+   present in the configured data directory.
+
 BARE-METAL
 
  - Kernel: The kernel logs ring buffer (syslog) and parameters (sysctl).
diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
index cfb1aa410d230..e551d0c0a2400 100644
--- a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
+++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go
@@ -70,6 +70,7 @@ func executeK8SBundle(ctx context.Context, bp bundleParams) error {
 		saveCmdLine(ps),
 		saveConfig(ps, bp.yActual),
 		saveControllerLogDir(ps, bp.y, bp.controllerLogLimitBytes),
+		saveCrashReports(ps, bp.y),
 		saveDataDirStructure(ps, bp.y),
 		saveDiskUsage(ctx, ps, bp.y),
 		saveInterrupts(ps),
diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
index 2ae0986683b58..c72d15020dab6 100644
--- a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
+++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go
@@ -135,6 +135,7 @@ func executeBundle(ctx context.Context, bp bundleParams) error {
 		saveCmdLine(ps),
 		saveConfig(ps, bp.y),
 		saveControllerLogDir(ps, bp.y, bp.controllerLogLimitBytes),
+		saveCrashReports(ps, bp.y),
 		saveDNSData(ctx, ps),
 		saveDataDirStructure(ps, bp.y),
 		saveDiskUsage(ctx, ps, bp.y),
@@ -1047,6 +1048,27 @@ func saveStartupLog(ps *stepParams, y *config.RedpandaYaml) step {
 	}
 }
 
+func saveCrashReports(ps *stepParams, y *config.RedpandaYaml) step {
+	return func() error {
+		if y.Redpanda.Directory == "" {
+			return fmt.Errorf("failed to save crash_reports: 'redpanda.data_directory' is empty on the provided configuration file")
+		}
+		crashReportDir := filepath.Join(y.Redpanda.Directory, "crash_reports")
+		exists, err := afero.Exists(ps.fs, crashReportDir)
+		if err != nil {
+			return fmt.Errorf("failed to save crash_reports: unable to check existence of the crash_reports directory: %v", err)
+		}
+		if !exists {
+			return fmt.Errorf("skipping crash_reports collection: directory %q does not exist", crashReportDir)
+		}
+		err = writeDirToZip(ps, crashReportDir, "crash_reports", nil)
+		if err != nil {
+			return fmt.Errorf("failed to save crash_reports: %v", err)
+		}
+		return nil
+	}
+}
+
 func walkDir(root string, files map[string]*fileInfo) error {
 	return filepath.WalkDir(
 		root,
diff --git a/tests/rptest/tests/rpk_debug_bundle_test.py b/tests/rptest/tests/rpk_debug_bundle_test.py
index f323211484a9f..82f1481777076 100644
--- a/tests/rptest/tests/rpk_debug_bundle_test.py
+++ b/tests/rptest/tests/rpk_debug_bundle_test.py
@@ -70,6 +70,13 @@ def test_debug_bundle(self):
                 continue
             if re.match(r".* error querying .*\.ntp\..* i\/o timeout", l):
                 self.logger.error(f"Non-fatal transitory NTP error: {l}")
+            elif re.match(
+                    r".*skipping\s+(startup_log collection|crash_reports collection):\s*(unable to find file|directory).*",
+                    l):
+                # This test runs in a fresh development container: there is
+                # no startup_log and we don't expect a crash_reports dir in
+                # the data_directory yet, so these skip messages are expected.
+                continue
             else:
                 self.logger.error(f"Bad output line: {l}")
                 filtered_errors.append(l)