From 6c7d6ea79944b333e299c613ff23f4d44adda06d Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 8 Jun 2023 14:14:53 +0900 Subject: [PATCH 1/4] Retry eviction of pod Signed-off-by: zoetrope --- op/reboot.go | 24 ++++++++++++++++-------- op/reboot_decide.go | 23 +++++++++++++++++++++-- pkg/cke/main.go | 18 +++++++++++++++++- server/config.go | 17 +++++++++++++++++ server/control.go | 19 ++++++++----------- server/strategy.go | 13 +++++++------ 6 files changed, 86 insertions(+), 28 deletions(-) create mode 100644 server/config.go diff --git a/op/reboot.go b/op/reboot.go index 92abc3d4a..02e0a99eb 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -22,19 +22,23 @@ const drainBackOffBaseSeconds = 60 type rebootDrainStartOp struct { finished bool - entries []*cke.RebootQueueEntry - config *cke.Reboot - apiserver *cke.Node + entries []*cke.RebootQueueEntry + config *cke.Reboot + apiserver *cke.Node + retryTimes int + retryInterval time.Duration mu sync.Mutex failedNodes []string } -func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator { +func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot, retryTimes int, retryInterval time.Duration) cke.InfoOperator { return &rebootDrainStartOp{ - entries: entries, - config: config, - apiserver: apiserver, + entries: entries, + config: config, + apiserver: apiserver, + retryTimes: retryTimes, + retryInterval: retryInterval, } } @@ -42,6 +46,8 @@ type rebootDrainStartCommand struct { entries []*cke.RebootQueueEntry protectedNamespaces *metav1.LabelSelector apiserver *cke.Node + retryTimes int + retryInterval time.Duration notifyFailedNode func(string) } @@ -82,6 +88,8 @@ func (o *rebootDrainStartOp) NextCommand() cke.Commander { protectedNamespaces: o.config.ProtectedNamespaces, apiserver: o.apiserver, notifyFailedNode: o.notifyFailedNode, + retryTimes: o.retryTimes, + retryInterval: o.retryInterval, } } @@ -156,7 +164,7 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // next, evict pods on each node for _, entry := range evictNodes { - err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected) + err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.retryTimes, c.retryInterval) if err != nil { c.notifyFailedNode(entry.Node) err = drainBackOff(ctx, inf, entry, err) diff --git a/op/reboot_decide.go b/op/reboot_decide.go index ecb53f483..9a7b0a902 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -74,14 +74,21 @@ func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node str // evictOrDeleteNodePod evicts or delete Pods on the specified Node. // It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. // If a running Job Pod exists, this function returns an error. 
-func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool) error { +func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, retryTimes int, retryInterval time.Duration) error { return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error { + retryCount := 0 + EVICT: + log.Info("start evicting pod", map[string]interface{}{ + "namespace": pod.Namespace, + "name": pod.Name, + }) err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{ ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, }) + retryCount++ switch { case err == nil: - log.Info("start evicting pod", map[string]interface{}{ + log.Info("evicted pod", map[string]interface{}{ "namespace": pod.Namespace, "name": pod.Name, }) @@ -105,6 +112,18 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st "name": pod.Name, }) default: + if retryCount < retryTimes { + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryInterval): + } + log.Info("retry eviction of pod", map[string]interface{}{ + "namespace": pod.Namespace, + "name": pod.Name, + }) + goto EVICT + } return fmt.Errorf("failed to evict pod %s/%s due to PDB: %w", pod.Namespace, pod.Name, err) } return nil diff --git a/pkg/cke/main.go b/pkg/cke/main.go index df322eb37..2aeaa0fe9 100644 --- a/pkg/cke/main.go +++ b/pkg/cke/main.go @@ -27,6 +27,8 @@ var ( flgSessionTTL = pflag.String("session-ttl", "60s", "leader session's TTL") flgDebugSabakan = pflag.Bool("debug-sabakan", false, "debug sabakan integration") flgMaxConcurrentUpdates = pflag.Int("max-concurrent-updates", 10, "the maximum number of components that can be updated simultaneously") + flgDrainRetryTimes = pflag.Int("drain-retry-times", 30, "number of drain retries") + flgDrainRetryInterval = pflag.String("drain-retry-interval", "10s", "interval between drain retries") ) func loadConfig(p string) (*etcdutil.Config, error) { @@ -114,9 +116,23 @@ func main() { if maxConcurrentUpdates <= 0 { log.ErrorExit(errors.New("max-concurrent-updates must be greater than 0")) } + retryTimes := *flgDrainRetryTimes + if retryTimes <= 0 { + log.ErrorExit(errors.New("drain-retry-times must be greater than 0")) + } + retryInterval, err := time.ParseDuration(*flgDrainRetryInterval) + if err != nil { + log.ErrorExit(err) + } // Controller - controller := server.NewController(session, interval, gcInterval, timeout, addon, maxConcurrentUpdates) + controller := server.NewController(session, addon, &server.Config{ + Interval: interval, + CertsGCInterval: gcInterval, + MaxConcurrentUpdates: maxConcurrentUpdates, + DrainRetryTimes: retryTimes, + DrainRetryInterval: retryInterval, + }) well.Go(controller.Run) // API server diff --git a/server/config.go b/server/config.go new file mode 100644 index 000000000..1625865c1 --- /dev/null +++ b/server/config.go @@ -0,0 +1,17 @@ +package server + +import "time" + +// Config is the configuration for cke-server. +type Config struct { + // Interval is the interval of the main loop. + Interval time.Duration + // CertsGCInterval is the interval of the certificate garbage collection. + CertsGCInterval time.Duration + // MaxConcurrentUpdates is the maximum number of concurrent updates. + MaxConcurrentUpdates int + // DrainRetryTimes is the number of retries for drain. + DrainRetryTimes int + // DrainRetryInterval is the interval of retries for drain. 
+ DrainRetryInterval time.Duration +} diff --git a/server/control.go b/server/control.go index bcd34eec7..4507ea2ef 100644 --- a/server/control.go +++ b/server/control.go @@ -23,17 +23,14 @@ var ( // Controller manage operations type Controller struct { - session *concurrency.Session - interval time.Duration - certsGCInterval time.Duration - timeout time.Duration - addon Integrator - maxConcurrentUpdates int + session *concurrency.Session + addon Integrator + config *Config } // NewController construct controller instance -func NewController(s *concurrency.Session, interval, gcInterval, timeout time.Duration, addon Integrator, maxConcurrentUpdates int) Controller { - return Controller{s, interval, gcInterval, timeout, addon, maxConcurrentUpdates} +func NewController(s *concurrency.Session, addon Integrator, config *Config) Controller { + return Controller{s, addon, config} } // Run execute procedures with leader elections @@ -149,7 +146,7 @@ func (c Controller) runLoop(ctx context.Context, leaderKey string) error { case <-ctx.Done(): return ctx.Err() } - ticker := time.NewTicker(c.interval) + ticker := time.NewTicker(c.config.Interval) defer ticker.Stop() for { select { @@ -170,7 +167,7 @@ func (c Controller) runLoop(ctx context.Context, leaderKey string) error { return ctx.Err() default: } - ticker := time.NewTicker(c.certsGCInterval) + ticker := time.NewTicker(c.config.CertsGCInterval) defer ticker.Stop() for { select { @@ -346,7 +343,7 @@ func (c Controller) runOnce(ctx context.Context, leaderKey string, tick <-chan t DrainCompleted: drainCompleted, DrainTimedout: drainTimedout, RebootDequeued: rebootDequeued, - }, c.maxConcurrentUpdates) + }, c.config) st := &cke.ServerStatus{ Phase: phase, diff --git a/server/strategy.go b/server/strategy.go index 852ba4748..2a7b42062 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -12,6 +12,7 @@ import ( corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "time" ) type DecideOpsRebootArgs struct { @@ -24,7 +25,7 @@ type DecideOpsRebootArgs struct { // DecideOps returns the next operations to do and the operation phase. // This returns nil when no operations need to be done. -func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constraints, resources []cke.ResourceDefinition, rebootArgs DecideOpsRebootArgs, maxConcurrentUpdates int) ([]cke.Operator, cke.OperationPhase) { +func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constraints, resources []cke.ResourceDefinition, rebootArgs DecideOpsRebootArgs, config *Config) ([]cke.Operator, cke.OperationPhase) { nf := NewNodeFilter(c, cs) // 0. Execute upgrade operation if necessary @@ -40,7 +41,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain // 1. Run or restart rivers. This guarantees: // - CKE tools image is pulled on all nodes. // - Rivers runs on all nodes and will proxy requests only to control plane nodes. - if ops := riversOps(c, nf, maxConcurrentUpdates); len(ops) > 0 { + if ops := riversOps(c, nf, config.MaxConcurrentUpdates); len(ops) > 0 { return ops, cke.PhaseRivers } @@ -65,7 +66,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 5. Run or restart kubernetes components. 
- if ops := k8sOps(c, nf, cs, maxConcurrentUpdates); len(ops) > 0 { + if ops := k8sOps(c, nf, cs, config.MaxConcurrentUpdates); len(ops) > 0 { return ops, cke.PhaseK8sStart } @@ -92,7 +93,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 10. Reboot nodes if reboot request has been arrived to the reboot queue, and the number of unreachable nodes is less than a threshold. - if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot { + if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf, config.DrainRetryTimes, config.DrainRetryInterval); phaseReboot { if !nf.EtcdIsGood() { log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil) return nil, cke.PhaseRebootNodes @@ -719,7 +720,7 @@ func cleanOps(c *cke.Cluster, nf *NodeFilter) (ops []cke.Operator) { return ops } -func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) { +func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter, drainRetryTimes int, drainRetryInterval time.Duration) (ops []cke.Operator, phaseReboot bool) { if len(rebootArgs.RQEntries) == 0 { return nil, false } @@ -743,7 +744,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp if len(nf.SSHNotConnectedNodes(sshCheckNodes, true, true)) > constraints.RebootMaximumUnreachable { log.Warn("cannot reboot nodes because too many nodes are unreachable", nil) } else { - ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot)) + ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot, drainRetryTimes, drainRetryInterval)) } } if len(rebootArgs.DrainCompleted) > 0 { From d7cd71610f0b1f5d110107f778c5f5e15f76f312 Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 8 Jun 2023 14:14:53 +0900 Subject: [PATCH 2/4] Retry eviction of pod Signed-off-by: zoetrope --- mtest/run_test.go | 10 ++++++++-- server/strategy_test.go | 8 +++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/mtest/run_test.go b/mtest/run_test.go index efe3d3725..f6fb328dd 100644 --- a/mtest/run_test.go +++ b/mtest/run_test.go @@ -219,7 +219,7 @@ func runCKE(image string) error { "-e GOFAIL_HTTP=0.0.0.0:1234 " + "--mount type=bind,source=/var/lib/cke,target=/var/lib/cke " + "--mount type=bind,source=/etc/cke/,target=/etc/cke/ " + - image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug") + image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug --drain-retry-times 3 --drain-retry-interval 1s") }) } env.Stop() @@ -341,7 +341,13 @@ func connectEtcd() (*clientv3.Client, error) { } func getClusterStatus(cluster *cke.Cluster) (*cke.ClusterStatus, []cke.ResourceDefinition, error) { - controller := server.NewController(nil, 0, time.Hour, time.Second*2, nil, 10) + controller := server.NewController(nil, nil, &server.Config{ + Interval: 0, + CertsGCInterval: time.Hour, + MaxConcurrentUpdates: 10, + DrainRetryTimes: 10, + DrainRetryInterval: 3 * time.Second, + }) etcd, err := connectEtcd() if err != nil { diff --git a/server/strategy_test.go b/server/strategy_test.go index 602d7f175..6904ed274 100644 --- a/server/strategy_test.go +++ b/server/strategy_test.go @@ -2496,7 +2496,13 @@ func TestDecideOps(t *testing.T) { for _, c := range cases { t.Run(c.Name, 
func(t *testing.T) { - ops, _ := DecideOps(c.Input.Cluster, c.Input.Status, c.Input.Constraints, c.Input.Resources, c.Input.RebootArgs, 5) + ops, _ := DecideOps(c.Input.Cluster, c.Input.Status, c.Input.Constraints, c.Input.Resources, c.Input.RebootArgs, &Config{ + Interval: 0, + CertsGCInterval: 0, + MaxConcurrentUpdates: 5, + DrainRetryTimes: 3, + DrainRetryInterval: 10 * time.Second, + }) if len(ops) == 0 && len(c.ExpectedOps) == 0 { return } From 3992d8fe7bb58a14fb7c53c209cc5e8aa07c3685 Mon Sep 17 00:00:00 2001 From: zoetrope Date: Thu, 10 Aug 2023 17:54:46 +0900 Subject: [PATCH 3/4] Moved parameters to cluster.yaml --- .gitignore | 1 + cluster.go | 8 ++++++++ cluster_test.go | 12 ++++++++++++ docs/cluster.md | 5 ++++- mtest/run_test.go | 4 +--- op/reboot.go | 37 +++++++++++++++++++++---------------- op/reboot_decide.go | 10 +++++----- pkg/cke/main.go | 12 ------------ server/config.go | 4 ---- server/strategy.go | 7 +++---- server/strategy_test.go | 2 -- testdata/cluster.yaml | 2 ++ 12 files changed, 57 insertions(+), 47 deletions(-) diff --git a/.gitignore b/.gitignore index d5db6b6a5..da69d4330 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736 .glide/ +vendor/ # Editors *~ diff --git a/cluster.go b/cluster.go index 2b4579bb7..410c926bd 100644 --- a/cluster.go +++ b/cluster.go @@ -277,6 +277,8 @@ type Reboot struct { CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"` CommandRetries *int `json:"command_retries"` CommandInterval *int `json:"command_interval"` + EvictRetries *int `json:"evict_retries"` + EvictInterval *int `json:"evict_interval"` ProtectedNamespaces *metav1.LabelSelector `json:"protected_namespaces,omitempty"` } @@ -495,6 +497,12 @@ func validateReboot(reboot Reboot) error { if reboot.CommandInterval != nil && *reboot.CommandInterval < 0 { return errors.New("command_interval must not be negative") } + if reboot.EvictRetries != nil && *reboot.EvictRetries < 0 { + return errors.New("drain_retries must not be negative") + } + if reboot.EvictInterval != nil && *reboot.EvictInterval < 0 { + return errors.New("drain_interval must not be negative") + } if reboot.MaxConcurrentReboots != nil && *reboot.MaxConcurrentReboots <= 0 { return errors.New("max_concurrent_reboots must be positive") } diff --git a/cluster_test.go b/cluster_test.go index 4eb82f987..185db19b8 100644 --- a/cluster_test.go +++ b/cluster_test.go @@ -110,6 +110,18 @@ func testClusterYAML(t *testing.T) { if *c.Reboot.CommandInterval != 30 { t.Error(`*c.Reboot.CommandInterval != 30`) } + if c.Reboot.EvictRetries == nil { + t.Fatal(`c.Reboot.EvictRetries == nil`) + } + if *c.Reboot.EvictRetries != 10 { + t.Error(`*c.Reboot.EvictRetries != 10`) + } + if c.Reboot.EvictInterval == nil { + t.Fatal(`c.Reboot.EvictInterval == nil`) + } + if *c.Reboot.EvictInterval != 3 { + t.Error(`*c.Reboot.EvictInterval != 3`) + } if c.Reboot.ProtectedNamespaces == nil { t.Fatal(`c.Reboot.ProtectedNamespaces == nil`) } diff --git a/docs/cluster.md b/docs/cluster.md index c60e01a16..dc1caa639 100644 --- a/docs/cluster.md +++ b/docs/cluster.md @@ -68,13 +68,15 @@ Reboot ------ | Name | Required | Type | Description | -| -------------------------- | -------- | -------------------------------- | ----------------------------------------------------------------------- | +|----------------------------| -------- | -------------------------------- |-------------------------------------------------------------------------| | 
`reboot_command` | true | array | A command to reboot. List of strings. | | `boot_check_command` | true | array | A command to check nodes booted. List of strings. | | `eviction_timeout_seconds` | false | *int | Deadline for eviction. Must be positive. Default: 600 (10 minutes). | | `command_timeout_seconds` | false | *int | Deadline for rebooting. Zero means infinity. Default: wait indefinitely | | `command_retries` | false | *int | Number of reboot retries, not including initial attempt. Default: 0 | | `command_interval` | false | *int | Interval of time between reboot retries in seconds. Default: 0 | +| `evict_retries` | false | *int | Number of eviction retries, not including initial attempt. Default: 0 | +| `evict_interval` | false | *int | Interval of time between eviction retries in seconds. Default: 0 | | `max_concurrent_reboots` | false | *int | Maximum number of nodes to be rebooted concurrently. Default: 1 | | `protected_namespaces` | false | [`LabelSelector`][LabelSelector] | A label selector to protect namespaces. | @@ -89,6 +91,7 @@ If the node is not booted yet, this command should output `false` to stdout and If `command_timeout_seconds` is specified, the check command should return within `command_timeout_seconds` seconds, or it is considered failed. CKE tries to delete Pods in the `protected_namespaces` gracefully with the Kubernetes eviction API. +If the eviction API has failed, CKE retries it for `evict_retries` times with `evict_interval`-second interval. If any of the Pods cannot be deleted, it aborts the operation. The Pods in the non-protected namespaces are also tried to be deleted gracefully with the Kubernetes eviction API, but they would be simply deleted if eviction is denied. diff --git a/mtest/run_test.go b/mtest/run_test.go index f6fb328dd..3e9b9a01b 100644 --- a/mtest/run_test.go +++ b/mtest/run_test.go @@ -219,7 +219,7 @@ func runCKE(image string) error { "-e GOFAIL_HTTP=0.0.0.0:1234 " + "--mount type=bind,source=/var/lib/cke,target=/var/lib/cke " + "--mount type=bind,source=/etc/cke/,target=/etc/cke/ " + - image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug --drain-retry-times 3 --drain-retry-interval 1s") + image + " --config /etc/cke/cke.yml --interval 3s --certs-gc-interval 5m --session-ttl 5s --loglevel debug") }) } env.Stop() @@ -345,8 +345,6 @@ func getClusterStatus(cluster *cke.Cluster) (*cke.ClusterStatus, []cke.ResourceD Interval: 0, CertsGCInterval: time.Hour, MaxConcurrentUpdates: 10, - DrainRetryTimes: 10, - DrainRetryInterval: 3 * time.Second, }) etcd, err := connectEtcd() diff --git a/op/reboot.go b/op/reboot.go index 02e0a99eb..2a9533072 100644 --- a/op/reboot.go +++ b/op/reboot.go @@ -22,23 +22,19 @@ const drainBackOffBaseSeconds = 60 type rebootDrainStartOp struct { finished bool - entries []*cke.RebootQueueEntry - config *cke.Reboot - apiserver *cke.Node - retryTimes int - retryInterval time.Duration + entries []*cke.RebootQueueEntry + config *cke.Reboot + apiserver *cke.Node mu sync.Mutex failedNodes []string } -func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot, retryTimes int, retryInterval time.Duration) cke.InfoOperator { +func RebootDrainStartOp(apiserver *cke.Node, entries []*cke.RebootQueueEntry, config *cke.Reboot) cke.InfoOperator { return &rebootDrainStartOp{ - entries: entries, - config: config, - apiserver: apiserver, - retryTimes: retryTimes, - retryInterval: retryInterval, + entries: entries, + config: config, + apiserver: 
apiserver, } } @@ -46,8 +42,8 @@ type rebootDrainStartCommand struct { entries []*cke.RebootQueueEntry protectedNamespaces *metav1.LabelSelector apiserver *cke.Node - retryTimes int - retryInterval time.Duration + evictAttempts int + evictInterval time.Duration notifyFailedNode func(string) } @@ -83,13 +79,22 @@ func (o *rebootDrainStartOp) NextCommand() cke.Commander { } o.finished = true + attempts := 1 + if o.config.EvictRetries != nil { + attempts = *o.config.EvictRetries + 1 + } + interval := 0 * time.Second + if o.config.EvictInterval != nil { + interval = time.Second * time.Duration(*o.config.EvictInterval) + } + return rebootDrainStartCommand{ entries: o.entries, protectedNamespaces: o.config.ProtectedNamespaces, apiserver: o.apiserver, notifyFailedNode: o.notifyFailedNode, - retryTimes: o.retryTimes, - retryInterval: o.retryInterval, + evictAttempts: attempts, + evictInterval: interval, } } @@ -164,7 +169,7 @@ func (c rebootDrainStartCommand) Run(ctx context.Context, inf cke.Infrastructure // next, evict pods on each node for _, entry := range evictNodes { - err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.retryTimes, c.retryInterval) + err := evictOrDeleteNodePod(ctx, cs, entry.Node, protected, c.evictAttempts, c.evictInterval) if err != nil { c.notifyFailedNode(entry.Node) err = drainBackOff(ctx, inf, entry, err) diff --git a/op/reboot_decide.go b/op/reboot_decide.go index 9a7b0a902..14e0bb7a1 100644 --- a/op/reboot_decide.go +++ b/op/reboot_decide.go @@ -74,9 +74,9 @@ func checkJobPodNotExist(ctx context.Context, cs *kubernetes.Clientset, node str // evictOrDeleteNodePod evicts or delete Pods on the specified Node. // It first tries eviction. If the eviction failed and the Pod's namespace is not protected, it deletes the Pod. // If a running Job Pod exists, this function returns an error. 
-func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, retryTimes int, retryInterval time.Duration) error { +func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node string, protected map[string]bool, attempts int, interval time.Duration) error { return enumeratePods(ctx, cs, node, func(pod *corev1.Pod) error { - retryCount := 0 + evictCount := 0 EVICT: log.Info("start evicting pod", map[string]interface{}{ "namespace": pod.Namespace, @@ -85,7 +85,7 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st err := cs.CoreV1().Pods(pod.Namespace).EvictV1(ctx, &policyv1.Eviction{ ObjectMeta: metav1.ObjectMeta{Name: pod.Name, Namespace: pod.Namespace}, }) - retryCount++ + evictCount++ switch { case err == nil: log.Info("evicted pod", map[string]interface{}{ @@ -112,11 +112,11 @@ func evictOrDeleteNodePod(ctx context.Context, cs *kubernetes.Clientset, node st "name": pod.Name, }) default: - if retryCount < retryTimes { + if evictCount < attempts { select { case <-ctx.Done(): return ctx.Err() - case <-time.After(retryInterval): + case <-time.After(interval): } log.Info("retry eviction of pod", map[string]interface{}{ "namespace": pod.Namespace, diff --git a/pkg/cke/main.go b/pkg/cke/main.go index 2aeaa0fe9..a572de4d1 100644 --- a/pkg/cke/main.go +++ b/pkg/cke/main.go @@ -27,8 +27,6 @@ var ( flgSessionTTL = pflag.String("session-ttl", "60s", "leader session's TTL") flgDebugSabakan = pflag.Bool("debug-sabakan", false, "debug sabakan integration") flgMaxConcurrentUpdates = pflag.Int("max-concurrent-updates", 10, "the maximum number of components that can be updated simultaneously") - flgDrainRetryTimes = pflag.Int("drain-retry-times", 30, "number of drain retries") - flgDrainRetryInterval = pflag.String("drain-retry-interval", "10s", "interval between drain retries") ) func loadConfig(p string) (*etcdutil.Config, error) { @@ -116,22 +114,12 @@ func main() { if maxConcurrentUpdates <= 0 { log.ErrorExit(errors.New("max-concurrent-updates must be greater than 0")) } - retryTimes := *flgDrainRetryTimes - if retryTimes <= 0 { - log.ErrorExit(errors.New("drain-retry-times must be greater than 0")) - } - retryInterval, err := time.ParseDuration(*flgDrainRetryInterval) - if err != nil { - log.ErrorExit(err) - } // Controller controller := server.NewController(session, addon, &server.Config{ Interval: interval, CertsGCInterval: gcInterval, MaxConcurrentUpdates: maxConcurrentUpdates, - DrainRetryTimes: retryTimes, - DrainRetryInterval: retryInterval, }) well.Go(controller.Run) diff --git a/server/config.go b/server/config.go index 1625865c1..dab7dbc9e 100644 --- a/server/config.go +++ b/server/config.go @@ -10,8 +10,4 @@ type Config struct { CertsGCInterval time.Duration // MaxConcurrentUpdates is the maximum number of concurrent updates. MaxConcurrentUpdates int - // DrainRetryTimes is the number of retries for drain. - DrainRetryTimes int - // DrainRetryInterval is the interval of retries for drain. - DrainRetryInterval time.Duration } diff --git a/server/strategy.go b/server/strategy.go index 2a7b42062..2aeb3d747 100644 --- a/server/strategy.go +++ b/server/strategy.go @@ -12,7 +12,6 @@ import ( corev1 "k8s.io/api/core/v1" discoveryv1 "k8s.io/api/discovery/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "time" ) type DecideOpsRebootArgs struct { @@ -93,7 +92,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain } // 10. 
Reboot nodes if reboot request has been arrived to the reboot queue, and the number of unreachable nodes is less than a threshold. - if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf, config.DrainRetryTimes, config.DrainRetryInterval); phaseReboot { + if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot { if !nf.EtcdIsGood() { log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil) return nil, cke.PhaseRebootNodes @@ -720,7 +719,7 @@ func cleanOps(c *cke.Cluster, nf *NodeFilter) (ops []cke.Operator) { return ops } -func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter, drainRetryTimes int, drainRetryInterval time.Duration) (ops []cke.Operator, phaseReboot bool) { +func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) { if len(rebootArgs.RQEntries) == 0 { return nil, false } @@ -744,7 +743,7 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp if len(nf.SSHNotConnectedNodes(sshCheckNodes, true, true)) > constraints.RebootMaximumUnreachable { log.Warn("cannot reboot nodes because too many nodes are unreachable", nil) } else { - ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot, drainRetryTimes, drainRetryInterval)) + ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot)) } } if len(rebootArgs.DrainCompleted) > 0 { diff --git a/server/strategy_test.go b/server/strategy_test.go index 6904ed274..909ce59d0 100644 --- a/server/strategy_test.go +++ b/server/strategy_test.go @@ -2500,8 +2500,6 @@ func TestDecideOps(t *testing.T) { Interval: 0, CertsGCInterval: 0, MaxConcurrentUpdates: 5, - DrainRetryTimes: 3, - DrainRetryInterval: 10 * time.Second, }) if len(ops) == 0 && len(c.ExpectedOps) == 0 { return diff --git a/testdata/cluster.yaml b/testdata/cluster.yaml index f404e90db..2c73026f2 100644 --- a/testdata/cluster.yaml +++ b/testdata/cluster.yaml @@ -17,6 +17,8 @@ reboot: command_timeout_seconds: 120 command_retries: 3 command_interval: 30 + evict_retries: 10 + evict_interval: 3 max_concurrent_reboots: 5 protected_namespaces: matchLabels: From 6e9fd2af69574ad55d5ed5c4b467d4a46ee94cf5 Mon Sep 17 00:00:00 2001 From: Akihiro Ikezoe Date: Wed, 16 Aug 2023 19:19:40 +0900 Subject: [PATCH 4/4] Update cluster.go Co-authored-by: morimoto-cybozu --- cluster.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cluster.go b/cluster.go index 410c926bd..2a5751653 100644 --- a/cluster.go +++ b/cluster.go @@ -498,10 +498,10 @@ func validateReboot(reboot Reboot) error { return errors.New("command_interval must not be negative") } if reboot.EvictRetries != nil && *reboot.EvictRetries < 0 { - return errors.New("drain_retries must not be negative") + return errors.New("evict_retries must not be negative") } if reboot.EvictInterval != nil && *reboot.EvictInterval < 0 { - return errors.New("drain_interval must not be negative") + return errors.New("evict_interval must not be negative") } if reboot.MaxConcurrentReboots != nil && *reboot.MaxConcurrentReboots <= 0 { return errors.New("max_concurrent_reboots must be positive")
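
For reference, the relevant part of a cluster.yaml reboot section with the new knobs looks like the following sketch. The values mirror the testdata/cluster.yaml change in this series; required keys such as reboot_command and boot_check_command are omitted here.

    reboot:
      command_timeout_seconds: 120
      command_retries: 3
      command_interval: 30
      evict_retries: 10     # eviction retries, not counting the initial attempt
      evict_interval: 3     # seconds to wait between eviction attempts
      max_concurrent_reboots: 5

With these values, an eviction that is denied (e.g. due to a PodDisruptionBudget) is attempted up to 11 times in total (the initial attempt plus 10 retries) at 3-second intervals; if every attempt fails, the node is reported as failed and the drain backs off as before.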