Skip to content

Commit

Permalink
change CKE to proceed rebooting immediately after draining of node is…
Browse files Browse the repository at this point in the history
… completed and add cancel check in drainBackOff (#707)

change CKE to proceed rebooting immediately after draining of node is completed
add cancel check in drainBackOff

Signed-off-by: YZ775 <[email protected]>

---------

Signed-off-by: YZ775 <[email protected]>
  • Loading branch information
YZ775 authored May 7, 2024
1 parent e488214 commit 9f43e0b
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 26 deletions.
7 changes: 7 additions & 0 deletions op/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,6 +569,13 @@ func drainBackOff(ctx context.Context, inf cke.Infrastructure, entry *cke.Reboot
"name": entry.Node,
log.FnError: err,
})
etcdEntry, err := inf.Storage().GetRebootsEntry(ctx, entry.Index)
if err != nil {
return err
}
if etcdEntry.Status == cke.RebootStatusCancelled {
return nil
}
entry.Status = cke.RebootStatusQueued
entry.LastTransitionTime = time.Now().Truncate(time.Second).UTC()
entry.DrainBackOffCount++
Expand Down
8 changes: 3 additions & 5 deletions op/reboot_decide.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,10 @@ func ChooseDrainedNodes(c *cke.Cluster, apiServers map[string]bool, rqEntries []
return nil
}
}
if len(workerInProgress) >= maxConcurrentReboots {
return nil
} else if len(workerInProgress)+len(workerDrainable) <= maxConcurrentReboots {
return workerDrainable
if len(workerInProgress) < maxConcurrentReboots && len(workerDrainable) > 0 {
return workerDrainable[:1]
} else {
return workerDrainable[:maxConcurrentReboots-len(workerInProgress)]
return nil
}
}

Expand Down
37 changes: 16 additions & 21 deletions server/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func DecideOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain
}

// 11. Reboot nodes if a reboot request has arrived in the reboot queue, and the number of unreachable nodes is less than a threshold.
if ops, phaseReboot := rebootOps(c, constraints, rebootArgs, nf); phaseReboot {
if ops := rebootOps(c, constraints, rebootArgs, nf); len(ops) > 0 {
if !nf.EtcdIsGood() {
log.Warn("cannot reboot nodes because etcd cluster is not responding and in-sync", nil)
return nil, cke.PhaseRebootNodes
Expand Down Expand Up @@ -874,26 +874,33 @@ func repairOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain
return ops, phaseRepair
}

func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator, phaseReboot bool) {
func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOpsRebootArgs, nf *NodeFilter) (ops []cke.Operator) {
if len(rebootArgs.RQEntries) == 0 {
return nil, false
return nil
}
if len(c.Reboot.RebootCommand) == 0 {
log.Warn("reboot command is not specified in the cluster configuration", nil)
return nil, false
return nil
}
if len(c.Reboot.BootCheckCommand) == 0 {
log.Warn("boot check command is not specified in the cluster configuration", nil)
return nil, false
return nil
}

if len(rebootArgs.RebootCancelled) > 0 {
phaseReboot = true
ops = append(ops, op.RebootCancelOp(rebootArgs.RebootCancelled))
return ops, phaseReboot
}
if len(rebootArgs.RebootDequeued) > 0 {
ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued))
}
if len(ops) > 0 {
return ops
}

if len(rebootArgs.DrainCompleted) > 0 {
ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot))
}
if len(rebootArgs.NewlyDrained) > 0 {
phaseReboot = true
sshCheckNodes := make([]*cke.Node, 0, len(nf.cluster.Nodes))
for _, node := range nf.cluster.Nodes {
if !rebootProcessing(rebootArgs.RQEntries, node.Address) {
Expand All @@ -906,23 +913,11 @@ func rebootOps(c *cke.Cluster, constraints *cke.Constraints, rebootArgs DecideOp
ops = append(ops, op.RebootDrainStartOp(nf.HealthyAPIServer(), rebootArgs.NewlyDrained, &c.Reboot))
}
}
if len(rebootArgs.DrainCompleted) > 0 {
phaseReboot = true
ops = append(ops, op.RebootRebootOp(nf.HealthyAPIServer(), rebootArgs.DrainCompleted, &c.Reboot))
}
if len(rebootArgs.DrainTimedout) > 0 {
phaseReboot = true
ops = append(ops, op.RebootDrainTimeoutOp(rebootArgs.DrainTimedout))
}
if len(rebootArgs.RebootDequeued) > 0 {
phaseReboot = true
ops = append(ops, op.RebootDequeueOp(rebootArgs.RebootDequeued))
}
if len(ops) > 0 {
phaseReboot = true
}

return ops, phaseReboot
return ops
}

func rebootUncordonOp(cs *cke.ClusterStatus, rqEntries []*cke.RebootQueueEntry, nf *NodeFilter) cke.Operator {
Expand Down

0 comments on commit 9f43e0b

Please sign in to comment.