From 0353dd0e975b5e21feef701e5ae3791da32041e9 Mon Sep 17 00:00:00 2001 From: Peter Sabaini Date: Mon, 9 Oct 2023 14:45:59 +0200 Subject: [PATCH] Fixes for disk remove issues Fix regexp for stopping OSD services. Don't treat a failure to stop an OSD service as fatal, might be down already. Increase default timeout and retries, the default was overly optimistic. Test cleanup. Signed-off-by: Peter Sabaini --- microceph/ceph/osd.go | 13 +++++-------- microceph/ceph/osd_test.go | 5 ----- microceph/cmd/microceph/disk_remove.go | 2 +- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/microceph/ceph/osd.go b/microceph/ceph/osd.go index 8e56e001..f557d715 100644 --- a/microceph/ceph/osd.go +++ b/microceph/ceph/osd.go @@ -709,12 +709,9 @@ func doRemoveOSD(ctx context.Context, s common.StateInterface, osd int64, bypass return err } } - // stop the OSD service + // stop the OSD service, but don't fail if it's not running if isPresent { - err = killOSD(osd) - } - if err != nil { - return err + _ = killOSD(osd) } // perform safety check for destroying if isPresent && !bypassSafety { @@ -775,7 +772,7 @@ func outDownOSD(osd int64) error { func safetyCheckStop(osd int64) error { var safeStop bool - retries := 12 + retries := 16 var backoff time.Duration for i := 0; i < retries; i++ { @@ -799,7 +796,7 @@ func safetyCheckStop(osd int64) error { func safetyCheckDestroy(osd int64) error { var safeDestroy bool - retries := 12 + retries := 16 var backoff time.Duration for i := 0; i < retries; i++ { @@ -878,7 +875,7 @@ func haveOSDInCeph(osd int64) (bool, error) { // killOSD terminates the osd process for an osd.id func killOSD(osd int64) error { - cmdline := fmt.Sprintf("ceph-osd .* --id %d", osd) + cmdline := fmt.Sprintf("ceph-osd .* --id %d$", osd) _, err := processExec.RunCommand("pkill", "-f", cmdline) if err != nil { logger.Errorf("Failed to kill osd.%d: %v", osd, err) diff --git a/microceph/ceph/osd_test.go b/microceph/ceph/osd_test.go index 423f34e4..3ea94a4f 100644 --- a/microceph/ceph/osd_test.go +++ b/microceph/ceph/osd_test.go @@ -27,11 +27,6 @@ func addCrushRuleLsExpectations(r *mocks.Runner) { r.On("RunCommand", cmdAny("ceph", 4)...).Return("microceph_auto_osd", nil).Once() } -// Expect: run ceph osd crush rule create-replicated -func addCrushRuleCreateExpectations(r *mocks.Runner) { - r.On("RunCommand", cmdAny("ceph", 7)...).Return("ok", nil).Once() -} - // Expect: run ceph osd crush rule dump func addCrushRuleDumpExpectations(r *mocks.Runner) { json := `{ "rule_id": 77 }` diff --git a/microceph/cmd/microceph/disk_remove.go b/microceph/cmd/microceph/disk_remove.go index a6fe5b19..c176e883 100644 --- a/microceph/cmd/microceph/disk_remove.go +++ b/microceph/cmd/microceph/disk_remove.go @@ -28,7 +28,7 @@ func (c *cmdDiskRemove) Command() *cobra.Command { RunE: c.Run, } - cmd.PersistentFlags().Int64Var(&c.flagTimeout, "timeout", 300, "Timeout to wait for safe removal (seconds), default=300") + cmd.PersistentFlags().Int64Var(&c.flagTimeout, "timeout", 1800, "Timeout to wait for safe removal (seconds), default=1800") cmd.PersistentFlags().BoolVar(&c.flagBypassSafety, "bypass-safety-checks", false, "Bypass safety checks") cmd.PersistentFlags().BoolVar(&c.flagConfirmDowngrade, "confirm-failure-domain-downgrade", false, "Confirm failure domain downgrade if required")