diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go
index 07ecd799c059..b954af750b53 100644
--- a/internal/controllers/machine/machine_controller.go
+++ b/internal/controllers/machine/machine_controller.go
@@ -57,7 +57,8 @@ import (
 
 const (
 	// controllerName defines the controller used when creating clients.
-	controllerName = "machine-controller"
+	controllerName     = "machine-controller"
+	nodeUnreachableKey = "node.kubernetes.io/unschedulable"
 )
 
 var (
@@ -66,6 +67,11 @@ var (
 	errNoControlPlaneNodes        = errors.New("no control plane members")
 	errClusterIsBeingDeleted      = errors.New("cluster is being deleted")
 	errControlPlaneIsBeingDeleted = errors.New("control plane is being deleted")
+	unreachableToleration         = corev1.Toleration{
+		Key:      nodeUnreachableKey,
+		Effect:   corev1.TaintEffectNoSchedule,
+		Operator: corev1.TolerationOpExists,
+	}
 )
 
 // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;patch
@@ -618,6 +624,9 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 		ErrOut: writer{func(msg string, keysAndValues ...interface{}) {
 			log.Error(nil, msg, keysAndValues...)
 		}},
+		AdditionalFilters: []kubedrain.PodFilter{
+			skipUnreachableTolerationPods,
+		},
 		// SPECTRO: Even if the node is reachable, we wait 30 minutes for drain completion else move ahead
 		SkipWaitForDeleteTimeoutSeconds: 60 * 30, // 30 minutes
 	}
@@ -643,6 +652,17 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
 	return ctrl.Result{}, nil
 }
 
+// skipUnreachableTolerationPods is a drain filter that marks pods tolerating the nodeUnreachableKey taint as skipped.
+func skipUnreachableTolerationPods(pod corev1.Pod) kubedrain.PodDeleteStatus {
+	if pod.Spec.Tolerations == nil {
+		return kubedrain.MakePodDeleteStatusOkay()
+	}
+	if HasTolerations(&pod, &unreachableToleration) {
+		return kubedrain.MakePodDeleteStatusSkip()
+	}
+	return kubedrain.MakePodDeleteStatusOkay()
+}
+
 // shouldWaitForNodeVolumes returns true if node status still have volumes attached
 // pod deletion and volume detach happen asynchronously, so pod could be deleted before volume detached from the node
 // this could cause issue for some storage provisioner, for example, vsphere-volume this is problematic
diff --git a/internal/controllers/machine/machine_helpers.go b/internal/controllers/machine/machine_helpers.go
index cc6b99a176b9..e653f582c8db 100644
--- a/internal/controllers/machine/machine_helpers.go
+++ b/internal/controllers/machine/machine_helpers.go
@@ -17,6 +17,7 @@ limitations under the License.
 package machine
 
 import (
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 )
@@ -37,3 +38,13 @@ func HasMatchingLabels(matchSelector metav1.LabelSelector, matchLabels map[strin
 	}
 	return true
 }
+
+// HasTolerations returns true if the pod declares a toleration that matches the given toleration.
+func HasTolerations(pod *corev1.Pod, toleration *corev1.Toleration) bool {
+	for _, t := range pod.Spec.Tolerations {
+		if t.MatchToleration(toleration) {
+			return true
+		}
+	}
+	return false
+}
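
Illustrative only, not part of the patch: a minimal package-level test sketch showing how the new drain filter classifies pods. The file placement, test name, and fixtures are hypothetical; skipUnreachableTolerationPods, nodeUnreachableKey, and unreachableToleration come from the diff above, and PodDeleteStatus.Delete is the field exposed by k8s.io/kubectl/pkg/drain.

package machine

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
)

func TestSkipUnreachableTolerationPods(t *testing.T) {
	// A pod carrying an Exists/NoSchedule toleration for the node.kubernetes.io/unschedulable
	// taint matches unreachableToleration and should be skipped (Delete = false).
	tolerating := corev1.Pod{
		Spec: corev1.PodSpec{
			Tolerations: []corev1.Toleration{{
				Key:      nodeUnreachableKey,
				Operator: corev1.TolerationOpExists,
				Effect:   corev1.TaintEffectNoSchedule,
			}},
		},
	}
	if status := skipUnreachableTolerationPods(tolerating); status.Delete {
		t.Errorf("expected pod tolerating %s to be skipped during drain", nodeUnreachableKey)
	}

	// A pod with no tolerations falls through both branches and remains eligible for deletion.
	plain := corev1.Pod{}
	if status := skipUnreachableTolerationPods(plain); !status.Delete {
		t.Errorf("expected pod without tolerations to remain deletable")
	}
}

Note that corev1.Toleration.MatchToleration requires an exact match on key, operator, effect, and value, so only pods declaring the same Exists/NoSchedule toleration verbatim are filtered out of the drain.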