diff --git a/components/ws-manager-api/go/config/config.go b/components/ws-manager-api/go/config/config.go index 62a715ba0a8482..36b8e8b6a83540 100644 --- a/components/ws-manager-api/go/config/config.go +++ b/components/ws-manager-api/go/config/config.go @@ -142,6 +142,11 @@ type Configuration struct { // SSHGatewayCAPublicKey is a CA public key SSHGatewayCAPublicKey string + + // PodRecreationMaxRetries + PodRecreationMaxRetries int `json:"podRecreationMaxRetries,omitempty"` + // PodRecreationBackoff + PodRecreationBackoff util.Duration `json:"podRecreationBackoff,omitempty"` } type WorkspaceClass struct { diff --git a/components/ws-manager-api/go/crd/v1/workspace_types.go b/components/ws-manager-api/go/crd/v1/workspace_types.go index b1af101a6f96c6..c27cc118c3e03f 100644 --- a/components/ws-manager-api/go/crd/v1/workspace_types.go +++ b/components/ws-manager-api/go/crd/v1/workspace_types.go @@ -170,9 +170,10 @@ func (ps PortSpec) Equal(other PortSpec) bool { // WorkspaceStatus defines the observed state of Workspace type WorkspaceStatus struct { - PodStarts int `json:"podStarts"` - URL string `json:"url,omitempty" scrub:"redact"` - OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"` + PodStarts int `json:"podStarts"` + PodRecreated int `json:"podRecreated"` + URL string `json:"url,omitempty" scrub:"redact"` + OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"` // +kubebuilder:default=Unknown Phase WorkspacePhase `json:"phase,omitempty"` @@ -263,6 +264,9 @@ const ( // WorkspaceContainerRunning is true if the workspace container is running. // Used to determine if a backup can be taken, only once the container is stopped. WorkspaceConditionContainerRunning WorkspaceCondition = "WorkspaceContainerRunning" + + // WorkspaceConditionPodRejected is true if we detected that the pod was rejected by the node + WorkspaceConditionPodRejected WorkspaceCondition = "PodRejected" ) func NewWorkspaceConditionDeployed() metav1.Condition { @@ -291,6 +295,15 @@ func NewWorkspaceConditionFailed(message string) metav1.Condition { } } +func NewWorkspaceConditionPodRejected(message string, status metav1.ConditionStatus) metav1.Condition { + return metav1.Condition{ + Type: string(WorkspaceConditionPodRejected), + LastTransitionTime: metav1.Now(), + Status: status, + Message: message, + } +} + func NewWorkspaceConditionTimeout(message string) metav1.Condition { return metav1.Condition{ Type: string(WorkspaceConditionTimeout), diff --git a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml index 72dbd0d7179c95..016a04232c916f 100644 --- a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml +++ b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml @@ -543,6 +543,8 @@ spec: type: string podStarts: type: integer + podRecreated: + type: integer runtime: properties: hostIP: diff --git a/components/ws-manager-mk2/controllers/metrics.go b/components/ws-manager-mk2/controllers/metrics.go index 72f1eb1ace246f..f182a5f4c2c976 100644 --- a/components/ws-manager-mk2/controllers/metrics.go +++ b/components/ws-manager-mk2/controllers/metrics.go @@ -30,6 +30,7 @@ const ( workspaceStartFailuresTotal string = "workspace_starts_failure_total" workspaceFailuresTotal string = "workspace_failure_total" workspaceStopsTotal string = "workspace_stops_total" + workspaceRecreationsTotal string = "workspace_recreations_total" workspaceBackupsTotal string = "workspace_backups_total" workspaceBackupFailuresTotal string = "workspace_backups_failure_total" workspaceRestoresTotal string = "workspace_restores_total" @@ -57,6 +58,7 @@ type controllerMetrics struct { totalStartsFailureCounterVec *prometheus.CounterVec totalFailuresCounterVec *prometheus.CounterVec totalStopsCounterVec *prometheus.CounterVec + totalRecreationsCounterVec *prometheus.CounterVec totalBackupCounterVec *prometheus.CounterVec totalBackupFailureCounterVec *prometheus.CounterVec @@ -120,6 +122,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) { Name: workspaceStopsTotal, Help: "total number of workspaces stopped", }, []string{"reason", "type", "class"}), + totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ + Namespace: metricsNamespace, + Subsystem: metricsWorkspaceSubsystem, + Name: workspaceRecreationsTotal, + Help: "total number of workspace recreations", + }, []string{"type", "class", "attempt"}), totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{ Namespace: metricsNamespace, @@ -233,6 +241,14 @@ func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1 m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc() } +func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) { + class := ws.Spec.Class + tpe := string(ws.Spec.Type) + attempt := fmt.Sprint(ws.Status.PodRecreated) + + m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc() +} + func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) { class := ws.Spec.Class tpe := string(ws.Spec.Type) @@ -291,6 +307,7 @@ type metricState struct { recordedContentReady bool recordedBackupFailed bool recordedBackupCompleted bool + recordedRecreations int } func newMetricState(ws *workspacev1.Workspace) metricState { @@ -306,6 +323,7 @@ func newMetricState(ws *workspacev1.Workspace) metricState { recordedContentReady: ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady), recordedBackupFailed: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure), recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete), + recordedRecreations: ws.Status.PodRecreated, } } diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go index f495dfde1ccf7f..a219d4d8b13f2f 100644 --- a/components/ws-manager-mk2/controllers/status.go +++ b/components/ws-manager-mk2/controllers/status.go @@ -123,6 +123,16 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa workspace.Status.Phase = *phase } + if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) { + // Check: A situation where we want to retry? + if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") && strings.HasPrefix(pod.Status.Message, "Pod was rejected") { + // This is a situation where we want to re-create the pod! + log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure) + workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue)) + r.Recorder.Event(workspace, corev1.EventTypeWarning, "PodRejected", failure) + } + } + if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) { // workspaces can fail only once - once there is a failed condition set, stick with it log.Info("workspace failed", "workspace", workspace.Name, "reason", failure) diff --git a/components/ws-manager-mk2/controllers/subscriber_controller.go b/components/ws-manager-mk2/controllers/subscriber_controller.go index 0349d28e028cf0..70a0ce27c3bfe9 100644 --- a/components/ws-manager-mk2/controllers/subscriber_controller.go +++ b/components/ws-manager-mk2/controllers/subscriber_controller.go @@ -61,6 +61,12 @@ func (r *SubscriberReconciler) Reconcile(ctx context.Context, req ctrl.Request) workspace.Status.Conditions = []metav1.Condition{} } + if workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) { + // In this situation, we are about to re-create the pod. We don't want clients to see all the "stopping, stopped, starting" chatter, so we hide it here. + // TODO(gpl) Is this a sane approach? + return ctrl.Result{}, nil + } + if r.OnReconcile != nil { r.OnReconcile(ctx, &workspace) } diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go index 55301df5072b18..d6f05067aef694 100644 --- a/components/ws-manager-mk2/controllers/workspace_controller.go +++ b/components/ws-manager-mk2/controllers/workspace_controller.go @@ -180,7 +180,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp if len(workspacePods.Items) == 0 { // if there isn't a workspace pod and we're not currently deleting this workspace,// create one. switch { - case workspace.Status.PodStarts == 0: + case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1: sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace) if err != nil { log.Error(err, "unable to create startWorkspace context") @@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp log.Error(err, "unable to create Pod for Workspace", "pod", pod) return ctrl.Result{Requeue: true}, err } else { - // TODO(cw): replicate the startup mechanism where pods can fail to be scheduled, - // need to be deleted and re-created // Must increment and persist the pod starts, and ensure we retry on conflict. // If we fail to persist this value, it's possible that the Pod gets recreated // when the workspace stops, due to PodStarts still being 0 when the original Pod @@ -221,6 +219,43 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp r.Recorder.Event(workspace, corev1.EventTypeNormal, "Creating", "") } + case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected): + if workspace.Status.PodRecreated > r.Config.PodRecreationMaxRetries { + workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse)) + return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below + } + + // Must persist the modification pod starts, and ensure we retry on conflict. + // If we fail to persist this value, it's possible that the Pod gets recreated endlessly + // when the workspace stops, due to PodStarts still being 0 when the original Pod + // disappears. + // Use a Patch instead of an Update, to prevent conflicts. + patch := client.MergeFrom(workspace.DeepCopy()) + + // Reset status + sc := workspace.Status.DeepCopy() + workspace.Status = workspacev1.WorkspaceStatus{} + workspace.Status.OwnerToken = sc.OwnerToken + workspace.Status.PodStarts = sc.PodStarts + workspace.Status.PodRecreated = sc.PodRecreated + 1 + workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Recreating pod... (%d retry)", workspace.Status.PodRecreated), metav1.ConditionFalse)) + + if err := r.Status().Patch(ctx, workspace, patch); err != nil { + log.Error(err, "Failed to patch workspace status-reset") + return ctrl.Result{}, err + } + + // Reset metrics cache + r.metrics.forgetWorkspace(workspace) + + requeueAfter := 5 * time.Second + if r.Config.PodRecreationBackoff != 0 { + requeueAfter = time.Duration(r.Config.PodRecreationBackoff) + } + + r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "") + return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil + case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped: if err := r.deleteWorkspaceSecrets(ctx, workspace); err != nil { return ctrl.Result{}, err @@ -378,6 +413,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work lastState.recordedStartTime = true } + if lastState.recordedRecreations < workspace.Status.PodRecreated { + r.metrics.countWorkspaceRecreations(&log, workspace) + lastState.recordedRecreations = workspace.Status.PodRecreated + } + if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped { r.metrics.countWorkspaceStop(&log, workspace) @@ -403,7 +443,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool { isAborted := ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted) // Also ignore workspaces that are requested to be stopped before they became ready. isStoppedByRequest := ws.IsConditionTrue(workspacev1.WorkspaceConditionStoppedByRequest) - return !everReady && !isAborted && !isStoppedByRequest + // Also ignore pods that got rejected by the node + isPodRejected := ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) + return !everReady && !isAborted && !isStoppedByRequest && !isPodRejected } func (r *WorkspaceReconciler) emitPhaseEvents(ctx context.Context, ws *workspacev1.Workspace, old *workspacev1.WorkspaceStatus) { diff --git a/components/ws-manager-mk2/controllers/workspace_controller_test.go b/components/ws-manager-mk2/controllers/workspace_controller_test.go index 5af06352e4ed0f..f21d01fe706dcd 100644 --- a/components/ws-manager-mk2/controllers/workspace_controller_test.go +++ b/components/ws-manager-mk2/controllers/workspace_controller_test.go @@ -389,6 +389,149 @@ var _ = Describe("WorkspaceController", func() { }) }) + It("pod rejection should result in a retry", func() { + ws := newWorkspace(uuid.NewString(), "default") + m := collectMetricCounts(wsMetrics, ws) + + // ### prepare block start + By("creating workspace") + // Simulate pod getting scheduled to a node. + var node corev1.Node + node.Name = uuid.NewString() + Expect(k8sClient.Create(ctx, &node)).To(Succeed()) + // Manually create the workspace pod with the node name. + // We can't update the pod with the node name, as this operation + // is only allowed for the scheduler. So as a hack, we manually + // create the workspace's pod. + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: fmt.Sprintf("ws-%s", ws.Name), + Namespace: ws.Namespace, + Finalizers: []string{workspacev1.GitpodFinalizerName}, + Labels: map[string]string{ + wsk8s.WorkspaceManagedByLabel: constants.ManagedBy, + }, + }, + Spec: corev1.PodSpec{ + NodeName: node.Name, + Containers: []corev1.Container{{ + Name: "workspace", + Image: "someimage", + }}, + }, + } + + Expect(k8sClient.Create(ctx, pod)).To(Succeed()) + pod = createWorkspaceExpectPod(ws) + updateObjWithRetries(k8sClient, pod, false, func(pod *corev1.Pod) { + Expect(ctrl.SetControllerReference(ws, pod, k8sClient.Scheme())).To(Succeed()) + }) + // mimic the regular "start" phase + updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) { + ws.Status.PodStarts = 1 + ws.Status.PodRecreated = 0 + }) + + // Wait until controller has reconciled at least once (by waiting for the runtime status to get updated). + // This is necessary for the metrics to get recorded correctly. If we don't wait, the first reconciliation + // might be once the Pod is already in a running state, and hence the metric state might not record e.g. content + // restore. + // This is only necessary because we manually created the pod, normally the Pod creation is the controller's + // first reconciliation which ensures the metrics are recorded from the workspace's initial state. + + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed()) + g.Expect(ws.Status.Runtime).ToNot(BeNil()) + g.Expect(ws.Status.Runtime.PodName).To(Equal(pod.Name)) + }, timeout, interval).Should(Succeed()) + + // Await "deployed" condition, and check we are good + expectConditionEventually(ws, string(workspacev1.WorkspaceConditionDeployed), metav1.ConditionTrue, "") + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed()) + g.Expect(ws.Status.PodStarts).To(Equal(1)) + g.Expect(ws.Status.PodRecreated).To(Equal(0)) + }, timeout, interval).Should(Succeed()) + + // ### prepare block end + + // ### trigger block start + // Make pod be rejected 🪄 + By("rejecting pod") + rejectPod(pod) + + // TODO(gpl): how to check for transient states like: + // - pod deletion + // - PodRejected condition + // By("await pod deleted") + // expectWorkspaceCleanup(ws, pod) + // Eventually(func(g Gomega) { + // g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, pod)).To(MatchError(ContainSubstring("not found"))) + // }, timeout, interval).Should(Succeed()) + + By("await pod recreation") + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed()) + g.Expect(ws.Status.PodRecreated).To(Equal(1)) + g.Expect(ws.Status.Phase).To(Equal(workspacev1.WorkspacePhasePending)) + }, timeout, interval).Should(Succeed()) + // ### trigger block end + + // ### retry block start + // Transition Pod to pending, and expect workspace to reach Creating phase. + // This should also cause create time metrics to be recorded. + updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) { + pod.Status.Phase = corev1.PodPending + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ContainerCreating", + }, + }, + Name: "workspace", + }} + }) + + expectPhaseEventually(ws, workspacev1.WorkspacePhaseCreating) + // ### retry block end + + // ### move to running start + // Transition Pod to running, and expect workspace to reach Running phase. + // This should also cause e.g. startup time metrics to be recorded. + updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) { + pod.Status.Phase = corev1.PodRunning + pod.Status.ContainerStatuses = []corev1.ContainerStatus{{ + Name: "workspace", + Ready: true, + }} + }) + + updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) { + ws.Status.SetCondition(workspacev1.NewWorkspaceConditionContentReady(metav1.ConditionTrue, workspacev1.ReasonInitializationSuccess, "")) + }) + + expectPhaseEventually(ws, workspacev1.WorkspacePhaseRunning) + // ### move to running end + + // ### validate start + Eventually(func(g Gomega) { + g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed()) + g.Expect(ws.Status.PodStarts).To(Equal(2)) + g.Expect(ws.Status.PodRecreated).To(Equal(1)) + }, timeout, interval).Should(Succeed()) + + expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{ + restores: 1, + backups: 0, + backupFailures: 0, + failures: 1, + creatingCounts: 1, + stops: map[StopReason]int{StopReasonStartFailure: 1}, + starts: 1, // this is NOT PodStarts, but merely an artifact of how we count it in the tests + recreations: map[int]int{1: 1}, + }) + // ### validate end + }) }) Context("with headless workspaces", func() { @@ -634,6 +777,16 @@ func requestStop(ws *workspacev1.Workspace) { }) } +func rejectPod(pod *corev1.Pod) { + GinkgoHelper() + By("adding pod rejected condition") + updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) { + pod.Status.Phase = corev1.PodFailed + pod.Status.Reason = "NodeAffinity" + pod.Status.Message = "Pod was rejected" + }) +} + func markReady(ws *workspacev1.Workspace) { GinkgoHelper() By("adding content ready condition") @@ -829,6 +982,7 @@ type metricCounts struct { startFailures int failures int stops map[StopReason]int + recreations map[int]int backups int backupFailures int restores int @@ -855,12 +1009,17 @@ func collectMetricCounts(wsMetrics *controllerMetrics, ws *workspacev1.Workspace for _, reason := range stopReasons { stopCounts[reason] = int(testutil.ToFloat64(wsMetrics.totalStopsCounterVec.WithLabelValues(string(reason), tpe, cls))) } + recreations := make(map[int]int) + for _, attempts := range []int{1, 2, 3, 4, 5} { + recreations[attempts] = int(testutil.ToFloat64(wsMetrics.totalRecreationsCounterVec.WithLabelValues(tpe, cls, fmt.Sprint(attempts)))) + } return metricCounts{ starts: int(collectHistCount(startHist)), creatingCounts: int(collectHistCount(creatingHist)), startFailures: int(testutil.ToFloat64(wsMetrics.totalStartsFailureCounterVec.WithLabelValues(tpe, cls))), failures: int(testutil.ToFloat64(wsMetrics.totalFailuresCounterVec.WithLabelValues(tpe, cls))), stops: stopCounts, + recreations: recreations, backups: int(testutil.ToFloat64(wsMetrics.totalBackupCounterVec.WithLabelValues(tpe, cls))), backupFailures: int(testutil.ToFloat64(wsMetrics.totalBackupFailureCounterVec.WithLabelValues(tpe, cls))), restores: int(testutil.ToFloat64(wsMetrics.totalRestoreCounterVec.WithLabelValues(tpe, cls))),