diff --git a/api/v1beta1/machinedrainrules_types.go b/api/v1beta1/machinedrainrules_types.go index b5e0916b663f..da025ac2b272 100644 --- a/api/v1beta1/machinedrainrules_types.go +++ b/api/v1beta1/machinedrainrules_types.go @@ -22,13 +22,13 @@ import ( const ( // PodDrainLabel is the label that can be set on Pods in workload clusters to ensure a Pod is not drained. - // The only valid value is "skip". + // The only valid values are "skip" and "wait-completed". // This label takes precedence over MachineDrainRules defined in the management cluster. PodDrainLabel = "cluster.x-k8s.io/drain" ) -// MachineDrainRuleDrainBehavior defines the drain behavior. Can be either "Drain" or "Skip". -// +kubebuilder:validation:Enum=Drain;Skip +// MachineDrainRuleDrainBehavior defines the drain behavior. Can be either "Drain", "Skip", or "WaitCompleted". +// +kubebuilder:validation:Enum=Drain;Skip;WaitCompleted type MachineDrainRuleDrainBehavior string const ( @@ -37,6 +37,10 @@ const ( // MachineDrainRuleDrainBehaviorSkip means the drain for a Pod should be skipped. MachineDrainRuleDrainBehaviorSkip MachineDrainRuleDrainBehavior = "Skip" + + // MachineDrainRuleDrainBehaviorWaitCompleted means the Pod should not be evicted, + // but overall drain should wait until the Pod completes. + MachineDrainRuleDrainBehaviorWaitCompleted MachineDrainRuleDrainBehavior = "WaitCompleted" ) // MachineDrainRuleSpec defines the spec of a MachineDrainRule. diff --git a/config/crd/bases/cluster.x-k8s.io_machinedrainrules.yaml b/config/crd/bases/cluster.x-k8s.io_machinedrainrules.yaml index 09a1536cd704..7941d5aedad4 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinedrainrules.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinedrainrules.yaml @@ -71,6 +71,7 @@ spec: enum: - Drain - Skip + - WaitCompleted type: string order: description: |- diff --git a/docs/proposals/20240930-machine-drain-rules.md b/docs/proposals/20240930-machine-drain-rules.md index aebe9cbe14e6..99926594f617 100644 --- a/docs/proposals/20240930-machine-drain-rules.md +++ b/docs/proposals/20240930-machine-drain-rules.md @@ -46,56 +46,56 @@ status: implementable ## Summary -Today, when Cluster API deletes a Machine it drains the corresponding Node to ensure all Pods running on the Node have -been gracefully terminated before deleting the corresponding infrastructure. The current drain implementation has -hard-coded rules to decide which Pods should be evicted. This implementation is aligned to `kubectl drain` (see -[Machine deletion process](https://main.cluster-api.sigs.k8s.io/tasks/automated-machine-management/machine_deletions) +Today, when Cluster API deletes a Machine it drains the corresponding Node to ensure all Pods running on the Node have +been gracefully terminated before deleting the corresponding infrastructure. The current drain implementation has +hard-coded rules to decide which Pods should be evicted. This implementation is aligned to `kubectl drain` (see +[Machine deletion process](https://main.cluster-api.sigs.k8s.io/tasks/automated-machine-management/machine_deletions) for more details). -With recent changes in Cluster API, we can now have finer control on the drain process, and thus we propose a new -`MachineDrainRule` CRD to make the drain rules configurable per Pod. Additionally, we're proposing labels that +With recent changes in Cluster API, we can now have finer control on the drain process, and thus we propose a new +`MachineDrainRule` CRD to make the drain rules configurable per Pod. 
Additionally, we're proposing labels that workload cluster admins can add to individual Pods to control their drain behavior. -This would be a huge improvement over the “standard” `kubectl drain` aligned implementation we have today and help to +This would be a huge improvement over the “standard” `kubectl drain` aligned implementation we have today and help to solve a family of issues identified when running Cluster API in production. ## Motivation -While the “standard” `kubectl drain` rules have served us well, new user stories have emerged where a more sophisticated +While the “standard” `kubectl drain` rules have served us well, new user stories have emerged where a more sophisticated behavior is needed. ### User Stories **Story 1: Define drain order** -As a cluster admin/user, I would like to define in which order Pods are evicted. For example, I want to configure that +As a cluster admin/user, I would like to define in which order Pods are evicted. For example, I want to configure that Portworx Pods are evicted after all other Pods ([CAPI-11024](https://github.com/kubernetes-sigs/cluster-api/issues/11024)). -Note: As of today the only way to address this use case with Cluster API is to implement a pre-drain hook with a custom +Note: As of today the only way to address this use case with Cluster API is to implement a pre-drain hook with a custom drain process. Due to the complexity of this solution it is not viable for most users. **Story 2: Exclude “undrainable” Pods from drain** -As a cluster admin/user, I would like the node drain process to ignore pods that will obviously not drain properly +As a cluster admin/user, I would like the node drain process to ignore pods that will obviously not drain properly (e.g., because they're tolerating the unschedulable or all taints). -Note: As of today Pods that are tolerating the `node.kubernetes.io/unschedulable:NoSchedule` taint will just get rescheduled +Note: As of today Pods that are tolerating the `node.kubernetes.io/unschedulable:NoSchedule` taint will just get rescheduled to the Node after they have been evicted, and this can lead to race conditions and then prevent the Machine to be deleted. **Story 3: Exclude Pods that should not be drained from drain** -As a cluster admin/user, I would like to instruct the Node drain process to exclude some Pods, e.g. because I don't +As a cluster admin/user, I would like to instruct the Node drain process to exclude some Pods, e.g. because I don't care if they're gracefully terminated and they should run until the Node goes offline (e.g. monitoring agents). **Story 4: Define drain order & exclude Pods from drain without management cluster access** -As a workload cluster admin/user, I want to be able to define drain order of Pods and also exclude Pods from drain +As a workload cluster admin/user, I want to be able to define drain order of Pods and also exclude Pods from drain (similar to Story 1-3, just without management cluster access). **Story 5: Apply drain configuration to a set of Machines / Clusters** -As a cluster admin/user, I would like to be able to configure the drain behavior for all Machines of a management -cluster or for a subset of these Machines without having to configure each Machine individually. I would also like +As a cluster admin/user, I would like to be able to configure the drain behavior for all Machines of a management +cluster or for a subset of these Machines without having to configure each Machine individually. 
I would also like to be able to change the drain configuration without having to modify all Machine objects. ### Goals @@ -114,15 +114,15 @@ to be able to change the drain configuration without having to modify all Machin ### Future work -* Align with the evolving landscape for Node drain in Kubernetes as soon as the new features are available, +* Align with the evolving landscape for Node drain in Kubernetes as soon as the new features are available, stable and can be used for the Kubernetes versions Cluster API supports (e.g. [KEP-4212](https://github.com/kubernetes/enhancements/issues/4212), [KEP-4563](https://github.com/kubernetes/enhancements/issues/4563)). - Because of the wide range of workload cluster Kubernetes versions supported in Cluster API (n-5), it will take + Because of the wide range of workload cluster Kubernetes versions supported in Cluster API (n-5), it will take significant time until we can use new Kubernetes features in all supported versions. So an implementation in Cluster API is the only option to improve drain anytime soon (even for the oldest supported Kubernetes versions). - Also, please note that while we would like to align with Kubernetes and leverage its new features as soon as - possible, there will always be a delta to be addressed in Cluster API due to the fact that Cluster API is operated + Also, please note that while we would like to align with Kubernetes and leverage its new features as soon as + possible, there will always be a delta to be addressed in Cluster API due to the fact that Cluster API is operated from the management cluster, while drain is done in workload clusters. Additionally, Cluster API should take care of many clusters, while Kubernetes features are by design scoped to a single cluster. @@ -133,10 +133,10 @@ to be able to change the drain configuration without having to modify all Machin #### `MachineDrainRule` CRD -We propose to introduce a new `MachineDrainRule` namespace-scoped CRD. The goal of this CRD is to allow management +We propose to introduce a new `MachineDrainRule` namespace-scoped CRD. The goal of this CRD is to allow management cluster admins/users to define a drain rule (a specific drain behavior) for a set of Pods. -Additionally, drain rules can be restricted to a specific set of Machines, e.g. targeting only controlplane Machines, +Additionally, drain rules can be restricted to a specific set of Machines, e.g. targeting only controlplane Machines, Machines of specific Clusters, Linux Machines, etc. ```yaml @@ -147,8 +147,8 @@ metadata: namespace: default spec: drain: - # behavior can be Drain or Skip - behavior: Drain + # behavior can be Drain, Skip, or WaitCompleted + behavior: Drain # order can only be set if behavior == Drain # Pods with higher order are drained later order: 100 @@ -173,8 +173,8 @@ Notes: * The Machine controller will evaluate `machines` and `pods` selectors to determine for which Machine and to which Pods the rule should be applied. * All selectors are of type `metav1.LabelSelector` -* In order to allow expressing selectors with AND and OR predicates, both `machines` and `pods` have a list of - selectors instead of a single selector. Within a single element of the selector lists AND is used, while between multiple +* In order to allow expressing selectors with AND and OR predicates, both `machines` and `pods` have a list of + selectors instead of a single selector. Within a single element of the selector lists AND is used, while between multiple entries in the selector lists OR is used. 
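For illustration, a minimal sketch of the OR semantics between selector entries (the rule name and label values below are placeholders, not part of this proposal's API): a Pod matches the rule if it satisfies *either* of the two `pods` entries, while the `selector` and `namespaceSelector` within a single entry must both match.

```yaml
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDrainRule
metadata:
  name: example-or-semantics   # placeholder name
  namespace: default
spec:
  drain:
    behavior: Drain
  machines:
  - selector: {}
  pods:
  # Entry 1: Pods with app=example-app1, in any namespace.
  - selector:
      matchLabels:
        app: example-app1
  # Entry 2 (combined with entry 1 via OR): Pods with app=example-app2,
  # but only in the example-namespace (AND within this entry).
  - selector:
      matchLabels:
        app: example-app2
    namespaceSelector:
      matchLabels:
        kubernetes.io/metadata.name: example-namespace
```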
##### Example: Exclude Pods from drain @@ -191,8 +191,8 @@ spec: machines: - selector: {} pods: - # This selects all Pods with the app label in (example-app1,example-app2) AND in the example-namespace - - selector: + # This selects all Pods with the app label in (example-app1,example-app2) AND in the example-namespace + - selector: matchExpressions: - key: app operator: In @@ -238,15 +238,16 @@ spec: #### Drain labels -We propose to introduce new labels to allow workload cluster admins/users to define drain behavior +We propose to introduce new labels to allow workload cluster admins/users to define drain behavior for Pods. These labels would be either added directly to Pods or indirectly via Deployments, StatefulSets, etc. The labels will take precedence over `MachineDrainRules` specified in the management cluster. * `cluster.x-k8s.io/drain: skip` +* `cluster.x-k8s.io/drain: wait-completed` -Initially we also considered adding a `cluster.x-k8s.io/drain-order` label. But we're not entirely sure about it -yet as someone who has access to the workload cluster (or maybe only specific Pods) would be able to influence the -drain order of the entire cluster, which might lead to problems. The skip label is safe in comparison because it +Initially we also considered adding a `cluster.x-k8s.io/drain-order` label. But we're not entirely sure about it +yet as someone who has access to the workload cluster (or maybe only specific Pods) would be able to influence the +drain order of the entire cluster, which might lead to problems. The skip label is safe in comparison because it only influences the drain behavior of the Pod that has the label. #### Node drain behavior @@ -259,6 +260,8 @@ The following describes the new algorithm that decides which Pods should be drai * \=\> use `behavior: Skip` * If the Pod has `cluster.x-k8s.io/drain: skip` * \=\> use `behavior: Skip` + * If the Pod has `cluster.x-k8s.io/drain: wait-completed` + * \=\> use `behavior: WaitCompleted` * If there is a matching `MachineDrainRule` * \=\> use `behavior` and `order` from the first matching `MachineDrainRule` (based on alphabetical order) * Otherwise: @@ -270,21 +273,21 @@ The following describes the new algorithm that decides which Pods should be drai Notes: -* It does not make sense to evict DaemonSet Pods, because the DaemonSet controller always [adds tolerations for the +* It does not make sense to evict DaemonSet Pods, because the DaemonSet controller always [adds tolerations for the unschedulable taint to all its Pods](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/#taints-and-tolerations) so they would be just immediately rescheduled. -* It does not make sense to evict static Pods, because the kubelet won’t actually terminate them and the Pod object +* It does not make sense to evict static Pods, because the kubelet won’t actually terminate them and the Pod object will get recreated. * If multiple `MachineDrainRules` match the same Pod, the “first” rule (based on alphabetical order) will be applied * Alphabetical order is used because: - * There is no way to ensure only a single `MachineDrainRule` applies to a specific Pod, because not all Pods and their - labels are known when `MachineDrainRules` are created/updated especially as the Pods will also change over time. + * There is no way to ensure only a single `MachineDrainRule` applies to a specific Pod, because not all Pods and their + labels are known when `MachineDrainRules` are created/updated especially as the Pods will also change over time. 
* So a criteria is needed to decide which `MachineDrainRule` is used when multiple `MachineDrainRules` apply * For now, we decided to use simple alphabetical order. An alternative would have been to add a `priority` field to `MachineDrainRules` and enforce that each priority is used only once, but this is not entirely race condition free (when two `MachineDrainRules` are created "at the same time") and adding "another" priority field on top of `order` can be confusing to users. -* `Pods` with drain behavior `Drain` and given order will be evicted only after all other `Pods` with a lower order +* `Pods` with drain behavior `Drain` and given order will be evicted only after all other `Pods` with a lower order have been already deleted * When waiting for volume detach, volumes belonging to `Pods` with drain behavior `Skip` are going to be ignored. Or in other words, Machine deletion can continue if only volumes belonging to `Pods` with drain behavior `Skip` @@ -295,7 +298,7 @@ Notes: Today, after Node drain we are waiting for **all** volumes to be detached. We are going to change that behavior to ignore all attached volumes that belong to Pods for which we skipped the drain. -Please note, today the only Pods for which we skip drain that can have volumes are DaemonSet Pods. If a DaemonSet Pod +Please note, today the only Pods for which we skip drain that can have volumes are DaemonSet Pods. If a DaemonSet Pod has a volume currently wait for volume detach would block indefinitely. The only way around this today is to set either the `Machine.spec.nodeVolumeDetachTimeout` field or the `machine.cluster.x-k8s.io/exclude-wait-for-node-volume-detach` annotation. With this change we will stop waiting for volumes of DaemonSet Pods to be detached. @@ -320,25 +323,25 @@ Cluster CRs), it is also possible to further restrict access. **Add Node drain configuration to the Machine object** -We considered adding the drain rules directly to the Machine objects (and thus also to the MachineDeployment & MachineSet objects) -instead. We discarded this option because it would have made it necessary to configure Node drain on every single Machine. -By having a separate CRD it is now possible to configure the Node drain for all Clusters / Machines or a specific subset -of Machines at once. This also means that the Node drain configuration can be immediately changed without having to propagate +We considered adding the drain rules directly to the Machine objects (and thus also to the MachineDeployment & MachineSet objects) +instead. We discarded this option because it would have made it necessary to configure Node drain on every single Machine. +By having a separate CRD it is now possible to configure the Node drain for all Clusters / Machines or a specific subset +of Machines at once. This also means that the Node drain configuration can be immediately changed without having to propagate configuration changes to all Machines. **MachineDrainRequestTemplates** -We considered defining separate MachineDrainRequest and MachineDrainRequestTemplate CRDs. The MachineDrainRequestTemplates -would be used to define the drain behavior. When the Machine controller starts a drain a MachineDrainRequest would be -created based on the applicable MachineDrainRequestTemplates. The MachineDrainRequest would then be used to reconcile -and track the drain. Downside would be that if you have a lot of templates, you make a really large custom resource -that is unwieldy to manage. 
Additionally it is not clear if it's necessary to have a separate MachineDrainRequest CRD, +We considered defining separate MachineDrainRequest and MachineDrainRequestTemplate CRDs. The MachineDrainRequestTemplates +would be used to define the drain behavior. When the Machine controller starts a drain a MachineDrainRequest would be +created based on the applicable MachineDrainRequestTemplates. The MachineDrainRequest would then be used to reconcile +and track the drain. Downside would be that if you have a lot of templates, you make a really large custom resource +that is unwieldy to manage. Additionally it is not clear if it's necessary to have a separate MachineDrainRequest CRD, it's probably easier to track the drain directly on the Machine object. ## Upgrade Strategy -`MachineDrainRules` are orthogonal to the state of the Cluster / Machines as they only configure how the Machine controller -should drain Nodes. Accordingly, they are not part of the Machine specs. Thus, as soon as the new Cluster API version that -supports this feature is deployed, `MachineDrainRules` can be immediately used without rolling out / re-configuring any +`MachineDrainRules` are orthogonal to the state of the Cluster / Machines as they only configure how the Machine controller +should drain Nodes. Accordingly, they are not part of the Machine specs. Thus, as soon as the new Cluster API version that +supports this feature is deployed, `MachineDrainRules` can be immediately used without rolling out / re-configuring any Clusters / Machines. Please note that while the drain behavior of DaemonSet Pods won't change (we’ll continue to skip draining), @@ -354,8 +357,8 @@ we will stop waiting for detachment of volumes of DaemonSet Pods. ### Graduation Criteria -The `MachineDrainRules` CRD will be added as `v1beta1` CRD to the `cluster.x-k8s.io` apiGroup. -An additional feature gate is not required as the behavior of the Machine controller will stay the same if neither +The `MachineDrainRules` CRD will be added as `v1beta1` CRD to the `cluster.x-k8s.io` apiGroup. +An additional feature gate is not required as the behavior of the Machine controller will stay the same if neither `MachineDrainRule` CRDs nor the labels are used. ### Version Skew Strategy diff --git a/internal/controllers/machine/drain/drain.go b/internal/controllers/machine/drain/drain.go index 354d6504ea7d..c6e5cd9e8c4d 100644 --- a/internal/controllers/machine/drain/drain.go +++ b/internal/controllers/machine/drain/drain.go @@ -128,7 +128,7 @@ func (d *Helper) GetPodsForEviction(ctx context.Context, cluster *clusterv1.Clus podNamespaces[ns.Name] = &ns } - // Note: As soon as a filter decides that a Pod should be skipped (i.e. DrainBehavior == "Skip") + // Note: As soon as a filter decides that a Pod should be skipped (i.e. DrainBehavior == "Skip" or "WaitCompleted") // other filters won't be evaluated and the Pod will be skipped. 
list := filterPods(ctx, allPods, []PodFilter{ // Phase 1: Basic filtering (aligned to kubectl drain) @@ -150,7 +150,7 @@ func (d *Helper) GetPodsForEviction(ctx context.Context, cluster *clusterv1.Clus // Phase 2: Filtering based on drain label & MachineDrainRules - // Skip Pods with label cluster.x-k8s.io/drain == "skip" + // Skip Pods with label cluster.x-k8s.io/drain == "skip" or "wait-completed" d.drainLabelFilter, // Use drain behavior and order from first matching MachineDrainRule @@ -225,7 +225,8 @@ func filterPods(ctx context.Context, allPods []*corev1.Pod, filters []PodFilter) var deleteWarnings []string for _, filter := range filters { status = filter(ctx, pod) - if status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorSkip { + if status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorSkip || + status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted { // short-circuit as soon as pod is filtered out // at that point, there is no reason to run pod // through any additional filters @@ -281,6 +282,7 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev var podsToTriggerEvictionLater []PodDelete var podsWithDeletionTimestamp []PodDelete var podsToBeIgnored []PodDelete + var podsToWait []PodDelete for _, pod := range podDeleteList.items { switch { case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain && pod.Pod.DeletionTimestamp.IsZero(): @@ -289,6 +291,8 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev } else { podsToTriggerEvictionLater = append(podsToTriggerEvictionLater, pod) } + case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted: + podsToWait = append(podsToWait, pod) case pod.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain: podsWithDeletionTimestamp = append(podsWithDeletionTimestamp, pod) default: @@ -300,6 +304,7 @@ func (d *Helper) EvictPods(ctx context.Context, podDeleteList *PodDeleteList) Ev "podsToTriggerEvictionNow", podDeleteListToString(podsToTriggerEvictionNow, 5), "podsToTriggerEvictionLater", podDeleteListToString(podsToTriggerEvictionLater, 5), "podsWithDeletionTimestamp", podDeleteListToString(podsWithDeletionTimestamp, 5), + "podsToWaitCompleted", podDeleteListToString(podsToWait, 5), ) // Trigger evictions for at most 10s. We'll continue on the next reconcile if we hit the timeout. @@ -394,6 +399,10 @@ evictionLoop: res.PodsToTriggerEvictionLater = append(res.PodsToTriggerEvictionLater, pd.Pod) } + for _, pd := range podsToWait { + res.PodsToWaitCompleted = append(res.PodsToWaitCompleted, pd.Pod) + } + return res } @@ -431,13 +440,15 @@ type EvictionResult struct { PodsDeletionTimestampSet []*corev1.Pod PodsFailedEviction map[string][]*corev1.Pod PodsToTriggerEvictionLater []*corev1.Pod + PodsToWaitCompleted []*corev1.Pod PodsNotFound []*corev1.Pod PodsIgnored []*corev1.Pod } // DrainCompleted returns if a Node is entirely drained, i.e. if all relevant Pods have gone away. func (r EvictionResult) DrainCompleted() bool { - return len(r.PodsDeletionTimestampSet) == 0 && len(r.PodsFailedEviction) == 0 && len(r.PodsToTriggerEvictionLater) == 0 + return len(r.PodsDeletionTimestampSet) == 0 && len(r.PodsFailedEviction) == 0 && + len(r.PodsToTriggerEvictionLater) == 0 && len(r.PodsToWaitCompleted) == 0 } // ConditionMessage returns a condition message for the case where a drain is not completed. 
@@ -498,6 +509,10 @@ func (r EvictionResult) ConditionMessage(nodeDrainStartTime *metav1.Time) string conditionMessage = fmt.Sprintf("%s\nAfter above Pods have been removed from the Node, the following Pods will be evicted: %s", conditionMessage, PodListToString(r.PodsToTriggerEvictionLater, 3)) } + if len(r.PodsToWaitCompleted) > 0 { + conditionMessage = fmt.Sprintf("%s\nWaiting for the following Pods to complete without eviction: %s", + conditionMessage, PodListToString(r.PodsToWaitCompleted, 3)) + } return conditionMessage } diff --git a/internal/controllers/machine/drain/drain_test.go b/internal/controllers/machine/drain/drain_test.go index 7852601ac9f9..dd60f18d988e 100644 --- a/internal/controllers/machine/drain/drain_test.go +++ b/internal/controllers/machine/drain/drain_test.go @@ -140,6 +140,32 @@ func TestGetPodsForEviction(t *testing.T) { }, }, } + mdrBehaviorWaitCompleted := &clusterv1.MachineDrainRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mdr-behavior-wait-completed", + Namespace: "test-namespace", + }, + Spec: clusterv1.MachineDrainRuleSpec{ + Drain: clusterv1.MachineDrainRuleDrainConfig{ + Behavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, + }, + Machines: nil, // Match all machines + Pods: []clusterv1.MachineDrainRulePodSelector{ + { + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "behavior-wait-completed", + }, + }, + NamespaceSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "kubernetes.io/metadata.name": "test-namespace", + }, + }, + }, + }, + }, + } mdrBehaviorUnknown := &clusterv1.MachineDrainRule{ ObjectMeta: metav1.ObjectMeta{ Name: "mdr-behavior-unknown", @@ -637,6 +663,15 @@ func TestGetPodsForEviction(t *testing.T) { }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-3-skip-pod-with-drain-label-wait-completed", + Namespace: metav1.NamespaceDefault, + Labels: map[string]string{ + clusterv1.PodDrainLabel: "wait-completed", + }, + }, + }, }, wantPodDeleteList: PodDeleteList{items: []PodDelete{ { @@ -663,6 +698,18 @@ func TestGetPodsForEviction(t *testing.T) { Reason: PodDeleteStatusTypeSkip, }, }, + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-3-skip-pod-with-drain-label-wait-completed", + Namespace: metav1.NamespaceDefault, + }, + }, + Status: PodDeleteStatus{ + DrainBehavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, + Reason: PodDeleteStatusTypeWaitCompleted, + }, + }, }}, }, { @@ -714,8 +761,17 @@ func TestGetPodsForEviction(t *testing.T) { }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-3-behavior-wait-completed", + Namespace: "test-namespace", // matches the Namespace of the selector in mdrBehaviorWaitCompleted. + Labels: map[string]string{ + "app": "behavior-wait-completed", // matches mdrBehaviorWaitCompleted. 
+ }, + }, + }, }, - machineDrainRules: []*clusterv1.MachineDrainRule{mdrBehaviorDrain, mdrBehaviorSkip, mdrBehaviorUnknown}, + machineDrainRules: []*clusterv1.MachineDrainRule{mdrBehaviorDrain, mdrBehaviorSkip, mdrBehaviorUnknown, mdrBehaviorWaitCompleted}, wantPodDeleteList: PodDeleteList{items: []PodDelete{ { Pod: &corev1.Pod{ @@ -744,6 +800,18 @@ func TestGetPodsForEviction(t *testing.T) { Reason: PodDeleteStatusTypeSkip, }, }, + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-3-behavior-wait-completed", + Namespace: "test-namespace", + }, + }, + Status: PodDeleteStatus{ + DrainBehavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, + Reason: PodDeleteStatusTypeWaitCompleted, + }, + }, }}, }, } @@ -918,6 +986,19 @@ func Test_getMatchingMachineDrainRules(t *testing.T) { Pods: nil, // Match all Pods }, } + matchingMDRBehaviorWaitCompleted := &clusterv1.MachineDrainRule{ + ObjectMeta: metav1.ObjectMeta{ + Name: "mdr-behavior-wait-completed", + Namespace: "test-namespace", + }, + Spec: clusterv1.MachineDrainRuleSpec{ + Drain: clusterv1.MachineDrainRuleDrainConfig{ + Behavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, + }, + Machines: nil, // Match all machines + Pods: nil, // Match all Pods + }, + } matchingMDRBehaviorUnknown := &clusterv1.MachineDrainRule{ ObjectMeta: metav1.ObjectMeta{ Name: "mdr-behavior-unknown", @@ -986,6 +1067,7 @@ func Test_getMatchingMachineDrainRules(t *testing.T) { matchingMDRBehaviorDrainB, matchingMDRBehaviorDrainA, matchingMDRBehaviorSkip, + matchingMDRBehaviorWaitCompleted, matchingMDRBehaviorUnknown, notMatchingMDRDifferentNamespace, notMatchingMDRNotMatchingSelector, @@ -995,6 +1077,7 @@ func Test_getMatchingMachineDrainRules(t *testing.T) { matchingMDRBehaviorDrainB, matchingMDRBehaviorSkip, matchingMDRBehaviorUnknown, + matchingMDRBehaviorWaitCompleted, }, }, } @@ -1291,6 +1374,17 @@ func TestEvictPods(t *testing.T) { Reason: PodDeleteStatusTypeOkay, }, }, + { + Pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-9-wait-completed", + }, + }, + Status: PodDeleteStatus{ + DrainBehavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, // Will be skipped because DrainBehavior is set to WaitCompleted + Reason: PodDeleteStatusTypeWaitCompleted, + }, + }, }}, wantEvictionResult: EvictionResult{ PodsIgnored: []*corev1.Pod{ @@ -1354,6 +1448,13 @@ func TestEvictPods(t *testing.T) { }, }, }, + PodsToWaitCompleted: []*corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-9-wait-completed", + }, + }, + }, }, }, } @@ -1651,6 +1752,13 @@ After above Pods have been removed from the Node, the following Pods will be evi }, }, }, + PodsToWaitCompleted: []*corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{ + Name: "pod-16-wait-completed", + }, + }, + }, }, wantConditionMessage: `Drain not completed yet (started at 2024-10-09T16:13:59Z): * Pods pod-2-deletionTimestamp-set-1, pod-2-deletionTimestamp-set-2, pod-2-deletionTimestamp-set-3, ... (4 more): deletionTimestamp set, but still not removed from the Node @@ -1660,7 +1768,8 @@ After above Pods have been removed from the Node, the following Pods will be evi * Pod pod-8-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 3 * Pod pod-9-to-trigger-eviction-some-other-error: failed to evict Pod, some other error 4 * 1 Pod with other issues -After above Pods have been removed from the Node, the following Pods will be evicted: pod-11-eviction-later, pod-12-eviction-later, pod-13-eviction-later, ... 
(2 more)`, +After above Pods have been removed from the Node, the following Pods will be evicted: pod-11-eviction-later, pod-12-eviction-later, pod-13-eviction-later, ... (2 more) +Waiting for the following Pods to complete without eviction: pod-16-wait-completed`, }, { name: "Compute long condition message correctly with more skipped errors", diff --git a/internal/controllers/machine/drain/filters.go b/internal/controllers/machine/drain/filters.go index a92f850bd1c4..5c876e28aeb4 100644 --- a/internal/controllers/machine/drain/filters.go +++ b/internal/controllers/machine/drain/filters.go @@ -61,7 +61,8 @@ type PodDeleteList struct { func (l *PodDeleteList) Pods() []*corev1.Pod { pods := []*corev1.Pod{} for _, i := range l.items { - if i.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain { + if i.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorDrain || + i.Status.DrainBehavior == clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted { pods = append(pods, i.Pod) } } @@ -124,6 +125,8 @@ const ( PodDeleteStatusTypeOkay = "Okay" // PodDeleteStatusTypeSkip is "Skip". PodDeleteStatusTypeSkip = "Skip" + // PodDeleteStatusTypeWaitCompleted is "WaitCompleted". + PodDeleteStatusTypeWaitCompleted = "WaitCompleted" // PodDeleteStatusTypeWarning is "Warning". PodDeleteStatusTypeWarning = "Warning" // PodDeleteStatusTypeError is "Error". @@ -156,6 +159,14 @@ func MakePodDeleteStatusSkip() PodDeleteStatus { } } +// MakePodDeleteStatusWaitCompleted is a helper method to return the corresponding PodDeleteStatus. +func MakePodDeleteStatusWaitCompleted() PodDeleteStatus { + return PodDeleteStatus{ + DrainBehavior: clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted, + Reason: PodDeleteStatusTypeWaitCompleted, + } +} + // MakePodDeleteStatusWithWarning is a helper method to return the corresponding PodDeleteStatus. func MakePodDeleteStatusWithWarning(behavior clusterv1.MachineDrainRuleDrainBehavior, message string) PodDeleteStatus { var order *int32 @@ -272,9 +283,15 @@ func (d *Helper) skipDeletedFilter(ctx context.Context, pod *corev1.Pod) PodDele func (d *Helper) drainLabelFilter(ctx context.Context, pod *corev1.Pod) PodDeleteStatus { if labelValue, found := pod.ObjectMeta.Labels[clusterv1.PodDrainLabel]; found && strings.EqualFold(labelValue, string(clusterv1.MachineDrainRuleDrainBehaviorSkip)) { log := ctrl.LoggerFrom(ctx, "Pod", klog.KObj(pod)) - log.V(4).Info(fmt.Sprintf("Skip evicting Pod, because Pod has %s label", clusterv1.PodDrainLabel)) + log.V(4).Info(fmt.Sprintf("Skip evicting Pod, because Pod has %s label with %s value", clusterv1.PodDrainLabel, labelValue)) return MakePodDeleteStatusSkip() } + if labelValue, found := pod.ObjectMeta.Labels[clusterv1.PodDrainLabel]; found && + strings.EqualFold(strings.Replace(labelValue, "-", "", 1), string(clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted)) { + log := ctrl.LoggerFrom(ctx, "Pod", klog.KObj(pod)) + log.V(4).Info(fmt.Sprintf("Skip evicting Pod, because Pod has %s label with %s value", clusterv1.PodDrainLabel, labelValue)) + return MakePodDeleteStatusWaitCompleted() + } return MakePodDeleteStatusOkay() } @@ -293,13 +310,16 @@ func (d *Helper) machineDrainRulesFilter(machineDrainRules []*clusterv1.MachineD } // If the pod selector matches, use the drain behavior from the MachineDrainRule. 
+ log := ctrl.LoggerFrom(ctx, "Pod", klog.KObj(pod)) switch mdr.Spec.Drain.Behavior { case clusterv1.MachineDrainRuleDrainBehaviorDrain: return MakePodDeleteStatusOkayWithOrder(mdr.Spec.Drain.Order) case clusterv1.MachineDrainRuleDrainBehaviorSkip: - log := ctrl.LoggerFrom(ctx, "Pod", klog.KObj(pod)) log.V(4).Info(fmt.Sprintf("Skip evicting Pod, because MachineDrainRule %s with behavior %s applies to the Pod", mdr.Name, clusterv1.MachineDrainRuleDrainBehaviorSkip)) return MakePodDeleteStatusSkip() + case clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted: + log.V(4).Info(fmt.Sprintf("Skip evicting Pod, because MachineDrainRule %s with behavior %s applies to the Pod", mdr.Name, clusterv1.MachineDrainRuleDrainBehaviorWaitCompleted)) + return MakePodDeleteStatusWaitCompleted() default: return MakePodDeleteStatusWithError( fmt.Sprintf("MachineDrainRule %q has unknown spec.drain.behavior: %q", diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index a296365786d4..98e0cbc045e2 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -879,7 +879,6 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro } podsToBeDrained := podDeleteList.Pods() - if len(podsToBeDrained) == 0 { log.Info("Drain completed") return ctrl.Result{}, nil @@ -909,6 +908,7 @@ func (r *Reconciler) drainNode(ctx context.Context, s *scope) (ctrl.Result, erro "podsFailedEviction", drain.PodListToString(podsFailedEviction, 5), "podsWithDeletionTimestamp", drain.PodListToString(evictionResult.PodsDeletionTimestampSet, 5), "podsToTriggerEvictionLater", drain.PodListToString(evictionResult.PodsToTriggerEvictionLater, 5), + "podsToWaitCompleted", drain.PodListToString(evictionResult.PodsToWaitCompleted, 5), ) return ctrl.Result{RequeueAfter: drainRetryInterval}, nil }
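For context, a minimal usage sketch from the workload cluster side (not part of the diff above; the Job name, image, and labels are placeholders): Pods carrying the `cluster.x-k8s.io/drain: wait-completed` label handled by `drainLabelFilter` are not evicted, and the drain waits until they complete.

```yaml
apiVersion: batch/v1
kind: Job
metadata:
  name: example-batch-job        # placeholder name
spec:
  template:
    metadata:
      labels:
        # New label value introduced above: the Pod is not evicted during
        # drain; instead the drain waits until the Pod completes.
        cluster.x-k8s.io/drain: wait-completed
    spec:
      restartPolicy: Never
      containers:
      - name: worker
        image: registry.example.com/worker:latest   # placeholder image
```

Equivalently, a management cluster admin could set `behavior: WaitCompleted` in a `MachineDrainRule`, as shown in the proposal changes above.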