From 6c7d6ea79944b333e299c613ff23f4d44adda06d Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 8 Jun 2023 14:14:53 +0900 Subject: [PATCH 1/4] Retry eviction of pod Signed-off-by: zoetrope --- op/reboot.go | 24 ++++++++++++++++-------- op/reboot_decide.go | 23 +++++++++++++++++++++-- pkg/cke/main.go | 18 +++++++++++++++++- server/config.go | 17 +++++++++++++++++ server/control.go | 19 ++++++++----------- server/strategy.go | 13 +++++++------ 6 files changed, 86 insertions(+), 28 deletions(-) create mode 100644 server/config.go diff --git a/op/reboot.go b/op/reboot.go index 92abc3d4a..02e0a99eb 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -22,19 +22,23 @@ const drainBackOffBaseSeconds = 60 type rebootDrainStartOp struct { finished bool - entries []*cke.RebootQueueEntry - config *cke.Reboot - apiserver *cke.Node + entries []*cke.RebootQueueEntry + config *cke.Reboot + apiserver *cke.Node + retryTimes int + retryInterval time.Duration mu sync.Mutex failedNodes []string } -func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator { +func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot, retryTimes int, retryInterval time.Duration) cke.InfoOperator { return &rebootDrainStartOp{ - entries: entries, - config: config, - apiserver: apiserver, + entries: entries, + config: config, + apiserver: apiserver, + retryTimes: retryTimes, + retryInterval: retryInterval, } } @@ -42,6 +46,8 @@ type rebootDrainStartCommand struct { entries []*cke.RebootQueueEntry protectedNamespaces *metav1.LabelSelector apiserver *cke.Node + retryTimes int + retryInterval time.Duration notifyFailedNode func(string) } @@ -82,6 +88,8 @@ func (o *rebootDrainStartOp) NextCommand() cke.Commander { protectedNamespaces: o.config.ProtectedNamespaces, apiserver: o.apiserver, notifyFailedNode: o.notifyFailedNode, + retryTimes: o.retryTimes, + retryInterval: o.retryInterval, } } @@ -156,7 +164,7 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // next, evict pods on each node for _, entry := range evictNodes { - err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected) + err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.retryTimes, c.retryInterval) if err != nil { c.notifyFailedNode(entry.Node) err = drainBackOff(ctx, inf, entry, err) diff --git a/op/reboot_decide.go b/op/reboot_decide.go index ecb53f483..9a7b0a902 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -74,14 +74,21 @@ func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node str // evictOrDeleteNodePod evicts or delete Pods on the specified Node. // It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. // If a running Job Pod exists, this function returns an error. 
-func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error { +func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, retryTimes int, retryInterval time.Duration) error { return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error { + retryCount := 0 + EVICT: + log.Info("start evicting pod", map[string]interface{}{ + "namespace": pod.Namespace, + "name": pod.Name, + }) err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{ ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, }) + retryCount++ switch { case err == nil: - log.Info("start evicting pod", map[string]interface{}{ + log.Info("evicted pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, }) @@ -105,6 +112,18 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st "name": pod.Name, }) default: + if retryCount < retryTimes { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryInterval): + } + log.Info("retry eviction of pod", map[string]interface{}{ + "namespace": pod.Namespace, + "name": pod.Name, + }) + goto EVICT + } return fmt.Errorf("failed to evict pod %s/%s due to PDB: %w", pod.Namespace, pod.Name, err) } return nil diff --git a/pkg/cke/main.go b/pkg/cke/main.go index df322eb37..2aeaa0fe9 100644 --- a/pkg/cke/main.go +++ b/pkg/cke/main.go @@ -27,6 +27,8 @@ var ( flgSessionTTL = pflag.String("session-ttl", "60s", "leader session's TTL") flgDebugSabakan = pflag.Bool("debug-sabakan", false, "debug sabakan integration") flgMaxConcurrentUpdates = pflag.Int("max-concurrent-updates", 10, "the maximum number of components that can be updated simultaneously") + flgDrainRetryTimes = pflag.Int("drain-retry-times", 30, "number of drain retries") + flgDrainRetryInterval = pflag.String("drain-retry-interval", "10s", "interval between drain retries") ) func loadConfig(p string) (*etcdutil.Config, error) { @@ -114,9 +116,23 @@ func main() { if maxConcurrentUpdates <= 0 { log.ErrorExit(errors.New("max-concurrent-updates must be greater than 0")) } + retryTimes := *flgDrainRetryTimes + if retryTimes <= 0 { + log.ErrorExit(errors.New("drain-retry-times must be greater than 0")) + } + retryInterval, err := time.ParseDuration(*flgDrainRetryInterval) + if err != nil { + log.ErrorExit(err) + } // Controller - controller := server.NewController(session, interval, gcInterval, timeout, addon, maxConcurrentUpdates) + controller := server.NewController(session, addon, &server.Config{ + Interval: interval, + CertsGCInterval: gcInterval, + MaxConcurrentUpdates: maxConcurrentUpdates, + DrainRetryTimes: retryTimes, + DrainRetryInterval: retryInterval, + }) well.Go(controller.Run) // API server diff --git a/server/config.go b/server/config.go new file mode 100644 index 000000000..1625865c1 --- /dev/null +++ b/server/config.go @@ -0,0 +1,17 @@ +package server + +import "time" + +// Config is the configuration for cke-server. +type Config struct { + // Interval is the interval of the main loop. + Interval time.Duration + // CertsGCInterval is the interval of the certificate garbage collection. + CertsGCInterval time.Duration + // MaxConcurrentUpdates is the maximum number of concurrent updates. + MaxConcurrentUpdates int + // DrainRetryTimes is the number of retries for drain. + DrainRetryTimes int + // DrainRetryInterval is the interval of retries for drain. 
+ DrainRetryInterval time.Duration +} diff --git a/server/control.go b/server/control.go index bcd34eec7..4507ea2ef 100644 --- a/server/control.go +++ b/server/control.go @@ -23,17 +23,14 @@ var ( // Controller manage operations type Controller struct { - session *concurrency.Session - interval time.Duration - certsGCInterval time.Duration - timeout time.Duration - addon Integrator - maxConcurrentUpdates int + session *concurrency.Session + addon Integrator + config *Config } // NewController construct controller instance -func NewController(s *concurrency.Session, interval, gcInterval, timeout time.Duration, addon Integrator, maxConcurrentUpdates int) Controller { - return Controller{s, interval, gcInterval, timeout, addon, maxConcurrentUpdates} +func NewController(s *concurrency.Session, addon Integrator, config *Config) Controller { + return Controller{s, addon, config} } // Run execute procedures with leader elections @@ -149,7 +146,7 @@ func (c Controller) runLoop(ctx context.Context, leaderKey string) error { case <-ctx.Done(): return ctx.Err() } - ticker := time.NewTicker(c.interval) + ticker := time.NewTicker(c.config.Interval) defer ticker.Stop() for { select { @@ -170,7 +167,7 @@ func (c Controller) runLoop(ctx context.Context, leaderKey string) error { return ctx.Err() default: } - ticker := time.NewTicker(c.certsGCInterval) + ticker := time.NewTicker(c.config.CertsGCInterval) defer ticker.Stop() for { select { @@ -346,7 +343,7 @@ func (c Controller) runOnce(ctx context.Context, leaderKey string, tick <-chan t DrainCompleted: drainCompleted, DrainTimedout: drainTimedout, RebootDequeued: rebootDequeued, - }, c.maxConcurrentUpdates) + }, c.config) st := &cke.ServerStatus{ Phase: phase, diff --git a/server/strategy.go b/server/strategy.go index 852ba4748..2a7b42062 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -12,6 +12,7 @@ import ( corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "time" ) type DecideOpsRebootArgs struct { @@ -24,7 +25,7 @@ type DecideOpsRebootArgs struct { // DecideOps returns the next operations to do and the operation phase. // This returns nil when no operations need to be done. -func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constraints, resources []cke.ResourceDefinition, rebootArgs DecideOpsRebootArgs, maxConcurrentUpdates int) ([]cke.Operator, cke.OperationPhase) { +func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constraints, resources []cke.ResourceDefinition, rebootArgs DecideOpsRebootArgs, config *Config) ([]cke.Operator, cke.OperationPhase) { nf := NewNodeFilter(c, cs) // 0. Execute upgrade operation if necessary @@ -40,7 +41,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain // 1. Run or restart rivers. This guarantees: // - CKE tools image is pulled on all nodes. // - Rivers runs on all nodes and will proxy requests only to control plane nodes. - if ops := riversOps(c, nf, maxConcurrentUpdates); len(ops) > 0 { + if ops := riversOps(c, nf, config.MaxConcurrentUpdates); len(ops) > 0 { return ops, cke.PhaseRivers } @@ -65,7 +66,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 5. Run or restart kubernetes components. 
- if ops := k8sOps(c, nf, cs, maxConcurrentUpdates); len(ops) > 0 { + if ops := k8sOps(c, nf, cs, config.MaxConcurrentUpdates); len(ops) > 0 { return ops, cke.PhaseK8sStart } @@ -92,7 +93,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 10. Reboot nodes if reboot request has been arrived to the reboot queue, and the number of unreachable nodes is less than a threshold. - if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot { + if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf, config.DrainRetryTimes, config.DrainRetryInterval); phaseReboot { if !nf.EtcdIsGood() { log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil) return nil, cke.PhaseRebootNodes @@ -719,7 +720,7 @@ func cleanOps(c *cke.Cluster, nf *NodeFilter) (ops []cke.Operator) { return ops } -func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) { +func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter, drainRetryTimes int, drainRetryInterval time.Duration) (ops []cke.Operator, phaseReboot bool) { if len(rebootArgs.RQEntries) == 0 { return nil, false } @@ -743,7 +744,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp if len(nf.SSHNotConnectedNodes(sshCheckNodes, true, true)) > constraints.RebootMaximumUnreachable { log.Warn("cannot reboot nodes because too many nodes are unreachable", nil) } else { - ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot)) + ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot, drainRetryTimes, drainRetryInterval)) } } if len(rebootArgs.DrainCompleted) > 0 { From d7cd71610f0b1f5d110107f778c5f5e15f76f312 Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 8 Jun 2023 14:14:53 +0900 Subject: [PATCH 2/4] Retry eviction of pod Signed-off-by: zoetrope --- mtest/run_test.go | 10 ++++++++-- server/strategy_test.go | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/mtest/run_test.go b/mtest/run_test.go index efe3d3725..f6fb328dd 100644 --- a/mtest/run_test.go +++ b/mtest/run_test.go @@ -219,7 +219,7 @@ func runCKE(image string) error { "-e GOFAIL_HTTP=0.0.0.0:1234 " + "--mount type=bind,source=/var/lib/cke,target=/var/lib/cke " + "--mount type=bind,source=/etc/cke/,target=/etc/cke/ " + - image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug") + image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug --drain-retry-times 3 --drain-retry-interval 1s") }) } env.Stop() @@ -341,7 +341,13 @@ func connectEtcd() (*clientv3.Client, error) { } func getClusterStatus(cluster *cke.Cluster) (*cke.ClusterStatus, []cke.ResourceDefinition, error) { - controller := server.NewController(nil, 0, time.Hour, time.Second*2, nil, 10) + controller := server.NewController(nil, nil, &server.Config{ + Interval: 0, + CertsGCInterval: time.Hour, + MaxConcurrentUpdates: 10, + DrainRetryTimes: 10, + DrainRetryInterval: 3 * time.Second, + }) etcd, err := connectEtcd() if err != nil { diff --git a/server/strategy_test.go b/server/strategy_test.go index 602d7f175..6904ed274 100644 --- a/server/strategy_test.go +++ b/server/strategy_test.go @@ -2496,7 +2496,13 @@ func TestDecideOps(t *testing.T) { for _, c := range cases { t.Run(c.Name, 
func(t *testing.T) { - ops, _ := DecideOps(c.Input.Cluster, c.Input.Status, c.Input.Constraints, c.Input.Resources, c.Input.RebootArgs, 5) + ops, _ := DecideOps(c.Input.Cluster, c.Input.Status, c.Input.Constraints, c.Input.Resources, c.Input.RebootArgs, &Config{ + Interval: 0, + CertsGCInterval: 0, + MaxConcurrentUpdates: 5, + DrainRetryTimes: 3, + DrainRetryInterval: 10 * time.Second, + }) if len(ops) == 0 && len(c.ExpectedOps) == 0 { return } From 3992d8fe7bb58a14fb7c53c209cc5e8aa07c3685 Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 10 Aug 2023 17:54:46 +0900 Subject: [PATCH 3/4] Moved parameters to cluster.yaml --- .gitignore | 1 + cluster.go | 8 ++++++++ cluster_test.go | 12 ++++++++++++ docs/cluster.md | 5 ++++- mtest/run_test.go | 4 +--- op/reboot.go | 37 +++++++++++++++++++++---------------- op/reboot_decide.go | 10 +++++----- pkg/cke/main.go | 12 ------------ server/config.go | 4 ---- server/strategy.go | 7 +++---- server/strategy_test.go | 2 -- testdata/cluster.yaml | 2 ++ 12 files changed, 57 insertions(+), 47 deletions(-) diff --git a/.gitignore b/.gitignore index d5db6b6a5..da69d4330 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 .glide/ +vendor/ # Editors *~ diff --git a/cluster.go b/cluster.go index 2b4579bb7..410c926bd 100644 --- a/cluster.go +++ b/cluster.go @@ -277,6 +277,8 @@ type Reboot struct { CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"` CommandRetries *int `json:"command_retries"` CommandInterval *int `json:"command_interval"` + EvictRetries *int `json:"evict_retries"` + EvictInterval *int `json:"evict_interval"` ProtectedNamespaces *metav1.LabelSelector `json:"protected_namespaces,omitempty"` } @@ -495,6 +497,12 @@ func validateReboot(reboot Reboot) error { if reboot.CommandInterval != nil && *reboot.CommandInterval < 0 { return errors.New("command_interval must not be negative") } + if reboot.EvictRetries != nil && *reboot.EvictRetries < 0 { + return errors.New("drain_retries must not be negative") + } + if reboot.EvictInterval != nil && *reboot.EvictInterval < 0 { + return errors.New("drain_interval must not be negative") + } if reboot.MaxConcurrentReboots != nil && *reboot.MaxConcurrentReboots <= 0 { return errors.New("max_concurrent_reboots must be positive") } diff --git a/cluster_test.go b/cluster_test.go index 4eb82f987..185db19b8 100644 --- a/cluster_test.go +++ b/cluster_test.go @@ -110,6 +110,18 @@ func testClusterYAML(t *testing.T) { if *c.Reboot.CommandInterval != 30 { t.Error(`*c.Reboot.CommandInterval != 30`) } + if c.Reboot.EvictRetries == nil { + t.Fatal(`c.Reboot.EvictRetries == nil`) + } + if *c.Reboot.EvictRetries != 10 { + t.Error(`*c.Reboot.EvictRetries != 10`) + } + if c.Reboot.EvictInterval == nil { + t.Fatal(`c.Reboot.EvictInterval == nil`) + } + if *c.Reboot.EvictInterval != 3 { + t.Error(`*c.Reboot.EvictInterval != 3`) + } if c.Reboot.ProtectedNamespaces == nil { t.Fatal(`c.Reboot.ProtectedNamespaces == nil`) } diff --git a/docs/cluster.md b/docs/cluster.md index c60e01a16..dc1caa639 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -68,13 +68,15 @@ Reboot ------ | Name | Required | Type | Description | -| -------------------------- | -------- | -------------------------------- | ----------------------------------------------------------------------- | +|----------------------------| -------- | -------------------------------- |-------------------------------------------------------------------------| | 
`reboot_command` | true | array | A command to reboot. List of strings. | | `boot_check_command` | true | array | A command to check nodes booted. List of strings. | | `eviction_timeout_seconds` | false | *int | Deadline for eviction. Must be positive. Default: 600 (10 minutes). | | `command_timeout_seconds` | false | *int | Deadline for rebooting. Zero means infinity. Default: wait indefinitely | | `command_retries` | false | *int | Number of reboot retries, not including initial attempt. Default: 0 | | `command_interval` | false | *int | Interval of time between reboot retries in seconds. Default: 0 | +| `evict_retries` | false | *int | Number of eviction retries, not including initial attempt. Default: 0 | +| `evict_interval` | false | *int | Interval of time between eviction retries in seconds. Default: 0 | | `max_concurrent_reboots` | false | *int | Maximum number of nodes to be rebooted concurrently. Default: 1 | | `protected_namespaces` | false | [`LabelSelector`][LabelSelector] | A label selector to protect namespaces. | @@ -89,6 +91,7 @@ If the node is not booted yet, this command should output `false` to stdout and If `command_timeout_seconds` is specified, the check command should return within `command_timeout_seconds` seconds, or it is considered failed. CKE tries to delete Pods in the `protected_namespaces` gracefully with the Kubernetes eviction API. +If the eviction API has failed, CKE retries it for `evict_retries` times with `evict_interval`-second interval. If any of the Pods cannot be deleted, it aborts the operation. The Pods in the non-protected namespaces are also tried to be deleted gracefully with the Kubernetes eviction API, but they would be simply deleted if eviction is denied. diff --git a/mtest/run_test.go b/mtest/run_test.go index f6fb328dd..3e9b9a01b 100644 --- a/mtest/run_test.go +++ b/mtest/run_test.go @@ -219,7 +219,7 @@ func runCKE(image string) error { "-e GOFAIL_HTTP=0.0.0.0:1234 " + "--mount type=bind,source=/var/lib/cke,target=/var/lib/cke " + "--mount type=bind,source=/etc/cke/,target=/etc/cke/ " + - image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug --drain-retry-times 3 --drain-retry-interval 1s") + image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug") }) } env.Stop() @@ -345,8 +345,6 @@ func getClusterStatus(cluster *cke.Cluster) (*cke.ClusterStatus, []cke.ResourceD Interval: 0, CertsGCInterval: time.Hour, MaxConcurrentUpdates: 10, - DrainRetryTimes: 10, - DrainRetryInterval: 3 * time.Second, }) etcd, err := connectEtcd() diff --git a/op/reboot.go b/op/reboot.go index 02e0a99eb..2a9533072 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -22,23 +22,19 @@ const drainBackOffBaseSeconds = 60 type rebootDrainStartOp struct { finished bool - entries []*cke.RebootQueueEntry - config *cke.Reboot - apiserver *cke.Node - retryTimes int - retryInterval time.Duration + entries []*cke.RebootQueueEntry + config *cke.Reboot + apiserver *cke.Node mu sync.Mutex failedNodes []string } -func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot, retryTimes int, retryInterval time.Duration) cke.InfoOperator { +func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator { return &rebootDrainStartOp{ - entries: entries, - config: config, - apiserver: apiserver, - retryTimes: retryTimes, - retryInterval: retryInterval, + entries: entries, + config: config, + apiserver: 
apiserver, } } @@ -46,8 +42,8 @@ type rebootDrainStartCommand struct { entries []*cke.RebootQueueEntry protectedNamespaces *metav1.LabelSelector apiserver *cke.Node - retryTimes int - retryInterval time.Duration + evictAttempts int + evictInterval time.Duration notifyFailedNode func(string) } @@ -83,13 +79,22 @@ func (o *rebootDrainStartOp) NextCommand() cke.Commander { } o.finished = true + attempts := 1 + if o.config.EvictRetries != nil { + attempts = *o.config.EvictRetries + 1 + } + interval := 0 * time.Second + if o.config.EvictInterval != nil { + interval = time.Second * time.Duration(*o.config.EvictInterval) + } + return rebootDrainStartCommand{ entries: o.entries, protectedNamespaces: o.config.ProtectedNamespaces, apiserver: o.apiserver, notifyFailedNode: o.notifyFailedNode, - retryTimes: o.retryTimes, - retryInterval: o.retryInterval, + evictAttempts: attempts, + evictInterval: interval, } } @@ -164,7 +169,7 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // next, evict pods on each node for _, entry := range evictNodes { - err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.retryTimes, c.retryInterval) + err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval) if err != nil { c.notifyFailedNode(entry.Node) err = drainBackOff(ctx, inf, entry, err) diff --git a/op/reboot_decide.go b/op/reboot_decide.go index 9a7b0a902..14e0bb7a1 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -74,9 +74,9 @@ func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node str // evictOrDeleteNodePod evicts or delete Pods on the specified Node. // It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. // If a running Job Pod exists, this function returns an error. 
-func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, retryTimes int, retryInterval time.Duration) error { +func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error { return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error { - retryCount := 0 + evictCount := 0 EVICT: log.Info("start evicting pod", map[string]interface{}{ "namespace": pod.Namespace, @@ -85,7 +85,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{ ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, }) - retryCount++ + evictCount++ switch { case err == nil: log.Info("evicted pod", map[string]interface{}{ @@ -112,11 +112,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st "name": pod.Name, }) default: - if retryCount < retryTimes { + if evictCount < attempts { select { case <-ctx.Done(): return ctx.Err() - case <-time.After(retryInterval): + case <-time.After(interval): } log.Info("retry eviction of pod", map[string]interface{}{ "namespace": pod.Namespace, diff --git a/pkg/cke/main.go b/pkg/cke/main.go index 2aeaa0fe9..a572de4d1 100644 --- a/pkg/cke/main.go +++ b/pkg/cke/main.go @@ -27,8 +27,6 @@ var ( flgSessionTTL = pflag.String("session-ttl", "60s", "leader session's TTL") flgDebugSabakan = pflag.Bool("debug-sabakan", false, "debug sabakan integration") flgMaxConcurrentUpdates = pflag.Int("max-concurrent-updates", 10, "the maximum number of components that can be updated simultaneously") - flgDrainRetryTimes = pflag.Int("drain-retry-times", 30, "number of drain retries") - flgDrainRetryInterval = pflag.String("drain-retry-interval", "10s", "interval between drain retries") ) func loadConfig(p string) (*etcdutil.Config, error) { @@ -116,22 +114,12 @@ func main() { if maxConcurrentUpdates <= 0 { log.ErrorExit(errors.New("max-concurrent-updates must be greater than 0")) } - retryTimes := *flgDrainRetryTimes - if retryTimes <= 0 { - log.ErrorExit(errors.New("drain-retry-times must be greater than 0")) - } - retryInterval, err := time.ParseDuration(*flgDrainRetryInterval) - if err != nil { - log.ErrorExit(err) - } // Controller controller := server.NewController(session, addon, &server.Config{ Interval: interval, CertsGCInterval: gcInterval, MaxConcurrentUpdates: maxConcurrentUpdates, - DrainRetryTimes: retryTimes, - DrainRetryInterval: retryInterval, }) well.Go(controller.Run) diff --git a/server/config.go b/server/config.go index 1625865c1..dab7dbc9e 100644 --- a/server/config.go +++ b/server/config.go @@ -10,8 +10,4 @@ type Config struct { CertsGCInterval time.Duration // MaxConcurrentUpdates is the maximum number of concurrent updates. MaxConcurrentUpdates int - // DrainRetryTimes is the number of retries for drain. - DrainRetryTimes int - // DrainRetryInterval is the interval of retries for drain. - DrainRetryInterval time.Duration } diff --git a/server/strategy.go b/server/strategy.go index 2a7b42062..2aeb3d747 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -12,7 +12,6 @@ import ( corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "time" ) type DecideOpsRebootArgs struct { @@ -93,7 +92,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 10. 
Reboot nodes if reboot request has been arrived to the reboot queue, and the number of unreachable nodes is less than a threshold. - if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf, config.DrainRetryTimes, config.DrainRetryInterval); phaseReboot { + if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot { if !nf.EtcdIsGood() { log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil) return nil, cke.PhaseRebootNodes @@ -720,7 +719,7 @@ func cleanOps(c *cke.Cluster, nf *NodeFilter) (ops []cke.Operator) { return ops } -func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter, drainRetryTimes int, drainRetryInterval time.Duration) (ops []cke.Operator, phaseReboot bool) { +func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) { if len(rebootArgs.RQEntries) == 0 { return nil, false } @@ -744,7 +743,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp if len(nf.SSHNotConnectedNodes(sshCheckNodes, true, true)) > constraints.RebootMaximumUnreachable { log.Warn("cannot reboot nodes because too many nodes are unreachable", nil) } else { - ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot, drainRetryTimes, drainRetryInterval)) + ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot)) } } if len(rebootArgs.DrainCompleted) > 0 { diff --git a/server/strategy_test.go b/server/strategy_test.go index 6904ed274..909ce59d0 100644 --- a/server/strategy_test.go +++ b/server/strategy_test.go @@ -2500,8 +2500,6 @@ func TestDecideOps(t *testing.T) { Interval: 0, CertsGCInterval: 0, MaxConcurrentUpdates: 5, - DrainRetryTimes: 3, - DrainRetryInterval: 10 * time.Second, }) if len(ops) == 0 && len(c.ExpectedOps) == 0 { return diff --git a/testdata/cluster.yaml b/testdata/cluster.yaml index f404e90db..2c73026f2 100644 --- a/testdata/cluster.yaml +++ b/testdata/cluster.yaml @@ -17,6 +17,8 @@ reboot: command_timeout_seconds: 120 command_retries: 3 command_interval: 30 + evict_retries: 10 + evict_interval: 3 max_concurrent_reboots: 5 protected_namespaces: matchLabels: From 6e9fd2af69574ad55d5ed5c4b467d4a46ee94cf5 Mon Sep 17 00:00:00 2001 From: Akihiro Ikezoe Date: Wed, 16 Aug 2023 19:19:40 +0900 Subject: [PATCH 4/4] Update cluster.go Co-authored-by: morimoto-cybozu --- cluster.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster.go b/cluster.go index 410c926bd..2a5751653 100644 --- a/cluster.go +++ b/cluster.go @@ -498,10 +498,10 @@ func validateReboot(reboot Reboot) error { return errors.New("command_interval must not be negative") } if reboot.EvictRetries != nil && *reboot.EvictRetries < 0 { - return errors.New("drain_retries must not be negative") + return errors.New("evict_retries must not be negative") } if reboot.EvictInterval != nil && *reboot.EvictInterval < 0 { - return errors.New("drain_interval must not be negative") + return errors.New("evict_interval must not be negative") } if reboot.MaxConcurrentReboots != nil && *reboot.MaxConcurrentReboots <= 0 { return errors.New("max_concurrent_reboots must be positive")
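
For reference, the relevant part of a cluster.yaml reboot section with the new knobs looks like the following sketch. The values mirror the testdata/cluster.yaml change in this series; required keys such as reboot_command and boot_check_command are omitted here.

    reboot:
      command_timeout_seconds: 120
      command_retries: 3
      command_interval: 30
      evict_retries: 10     # eviction retries, not counting the initial attempt
      evict_interval: 3     # seconds to wait between eviction attempts
      max_concurrent_reboots: 5

With these values, an eviction that is denied (e.g. due to a PodDisruptionBudget) is attempted up to 11 times in total (the initial attempt plus 10 retries) at 3-second intervals; if every attempt fails, the node is reported as failed and the drain backs off as before.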