From c895e84b03802d18ecadc5bc2bbb728179ede889 Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Wed, 25 Sep 2024 13:40:11 +0000
Subject: [PATCH] [ws-manager] Re-create workspace pods (incl. test)

---
 components/ws-manager-api/go/config/config.go |   5 +
 .../go/crd/v1/workspace_types.go              |  19 ++-
 .../bases/workspace.gitpod.io_workspaces.yaml |   2 +
 .../ws-manager-mk2/controllers/metrics.go     |  18 ++
 .../ws-manager-mk2/controllers/status.go      |  10 ++
 .../controllers/subscriber_controller.go      |   6 +
 .../controllers/workspace_controller.go       |  50 +++++-
 .../controllers/workspace_controller_test.go  | 159 ++++++++++++++++++
 8 files changed, 262 insertions(+), 7 deletions(-)

diff --git a/components/ws-manager-api/go/config/config.go b/components/ws-manager-api/go/config/config.go
index 62a715ba0a8482..36b8e8b6a83540 100644
--- a/components/ws-manager-api/go/config/config.go
+++ b/components/ws-manager-api/go/config/config.go
@@ -142,6 +142,11 @@ type Configuration struct {
 
 	// SSHGatewayCAPublicKey is a CA public key
 	SSHGatewayCAPublicKey string
+
+	// PodRecreationMaxRetries is the maximum number of times a workspace pod is re-created after it got rejected by the node
+	PodRecreationMaxRetries int `json:"podRecreationMaxRetries,omitempty"`
+	// PodRecreationBackoff is the backoff applied before a rejected workspace pod is re-created
+	PodRecreationBackoff util.Duration `json:"podRecreationBackoff,omitempty"`
 }
 
 type WorkspaceClass struct {
diff --git a/components/ws-manager-api/go/crd/v1/workspace_types.go b/components/ws-manager-api/go/crd/v1/workspace_types.go
index b1af101a6f96c6..c27cc118c3e03f 100644
--- a/components/ws-manager-api/go/crd/v1/workspace_types.go
+++ b/components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -170,9 +170,10 @@ func (ps PortSpec) Equal(other PortSpec) bool {
 
 // WorkspaceStatus defines the observed state of Workspace
 type WorkspaceStatus struct {
-	PodStarts  int    `json:"podStarts"`
-	URL        string `json:"url,omitempty" scrub:"redact"`
-	OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
+	PodStarts    int    `json:"podStarts"`
+	PodRecreated int    `json:"podRecreated"`
+	URL          string `json:"url,omitempty" scrub:"redact"`
+	OwnerToken   string `json:"ownerToken,omitempty" scrub:"redact"`
 
 	// +kubebuilder:default=Unknown
 	Phase WorkspacePhase `json:"phase,omitempty"`
@@ -263,6 +264,9 @@ const (
 	// WorkspaceContainerRunning is true if the workspace container is running.
 	// Used to determine if a backup can be taken, only once the container is stopped.
 	WorkspaceConditionContainerRunning WorkspaceCondition = "WorkspaceContainerRunning"
+
+	// WorkspaceConditionPodRejected is true if we detected that the pod was rejected by the node
+	WorkspaceConditionPodRejected WorkspaceCondition = "PodRejected"
 )
 
 func NewWorkspaceConditionDeployed() metav1.Condition {
@@ -291,6 +295,15 @@ func NewWorkspaceConditionFailed(message string) metav1.Condition {
 	}
 }
 
+func NewWorkspaceConditionPodRejected(message string, status metav1.ConditionStatus) metav1.Condition {
+	return metav1.Condition{
+		Type:               string(WorkspaceConditionPodRejected),
+		LastTransitionTime: metav1.Now(),
+		Status:             status,
+		Message:            message,
+	}
+}
+
 func NewWorkspaceConditionTimeout(message string) metav1.Condition {
 	return metav1.Condition{
 		Type: string(WorkspaceConditionTimeout),
diff --git a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
index 72dbd0d7179c95..016a04232c916f 100644
--- a/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
+++ b/components/ws-manager-mk2/config/crd/bases/workspace.gitpod.io_workspaces.yaml
@@ -543,6 +543,8 @@ spec:
                 type: string
               podStarts:
                 type: integer
+              podRecreated:
+                type: integer
               runtime:
                 properties:
                   hostIP:
diff --git a/components/ws-manager-mk2/controllers/metrics.go b/components/ws-manager-mk2/controllers/metrics.go
index 72f1eb1ace246f..f182a5f4c2c976 100644
--- a/components/ws-manager-mk2/controllers/metrics.go
+++ b/components/ws-manager-mk2/controllers/metrics.go
@@ -30,6 +30,7 @@ const (
 	workspaceStartFailuresTotal  string = "workspace_starts_failure_total"
 	workspaceFailuresTotal       string = "workspace_failure_total"
 	workspaceStopsTotal          string = "workspace_stops_total"
+	workspaceRecreationsTotal    string = "workspace_recreations_total"
 	workspaceBackupsTotal        string = "workspace_backups_total"
 	workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
 	workspaceRestoresTotal       string = "workspace_restores_total"
@@ -57,6 +58,7 @@ type controllerMetrics struct {
 	totalStartsFailureCounterVec *prometheus.CounterVec
 	totalFailuresCounterVec      *prometheus.CounterVec
 	totalStopsCounterVec         *prometheus.CounterVec
+	totalRecreationsCounterVec   *prometheus.CounterVec
 
 	totalBackupCounterVec        *prometheus.CounterVec
 	totalBackupFailureCounterVec *prometheus.CounterVec
@@ -120,6 +122,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
 			Name:      workspaceStopsTotal,
 			Help:      "total number of workspaces stopped",
 		}, []string{"reason", "type", "class"}),
+		totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: metricsNamespace,
+			Subsystem: metricsWorkspaceSubsystem,
+			Name:      workspaceRecreationsTotal,
+			Help:      "total number of workspace recreations",
+		}, []string{"type", "class", "attempt"}),
 
 		totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
 			Namespace: metricsNamespace,
@@ -233,6 +241,14 @@ func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1
 	m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()
 }
 
+func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {
+	class := ws.Spec.Class
+	tpe := string(ws.Spec.Type)
+	attempt := fmt.Sprint(ws.Status.PodRecreated)
+
+	m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()
+}
+
 func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
 	class := ws.Spec.Class
 	tpe := string(ws.Spec.Type)
@@ -291,6 +307,7 @@ type metricState struct {
 	recordedContentReady     bool
 	recordedBackupFailed     bool
 	recordedBackupCompleted  bool
+	recordedRecreations      int
 }
 
 func newMetricState(ws *workspacev1.Workspace) metricState {
@@ -306,6 +323,7 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
 		recordedContentReady:    ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),
 		recordedBackupFailed:    ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),
 		recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),
+		recordedRecreations:     ws.Status.PodRecreated,
 	}
 }
diff --git a/components/ws-manager-mk2/controllers/status.go b/components/ws-manager-mk2/controllers/status.go
index f495dfde1ccf7f..a219d4d8b13f2f 100644
--- a/components/ws-manager-mk2/controllers/status.go
+++ b/components/ws-manager-mk2/controllers/status.go
@@ -123,6 +123,16 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
 		workspace.Status.Phase = *phase
 	}
 
+	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
+		// Check: A situation where we want to retry?
+		if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") && strings.HasPrefix(pod.Status.Message, "Pod was rejected") {
+			// This is a situation where we want to re-create the pod!
+			log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure)
+			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
+			r.Recorder.Event(workspace, corev1.EventTypeWarning, "PodRejected", failure)
+		}
+	}
+
 	if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
 		// workspaces can fail only once - once there is a failed condition set, stick with it
 		log.Info("workspace failed", "workspace", workspace.Name, "reason", failure)
diff --git a/components/ws-manager-mk2/controllers/subscriber_controller.go b/components/ws-manager-mk2/controllers/subscriber_controller.go
index 0349d28e028cf0..70a0ce27c3bfe9 100644
--- a/components/ws-manager-mk2/controllers/subscriber_controller.go
+++ b/components/ws-manager-mk2/controllers/subscriber_controller.go
@@ -61,6 +61,12 @@ func (r *SubscriberReconciler) Reconcile(ctx context.Context, req ctrl.Request)
 		workspace.Status.Conditions = []metav1.Condition{}
 	}
 
+	if workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
+		// In this situation, we are about to re-create the pod. We don't want clients to see all the "stopping, stopped, starting" chatter, so we hide it here.
+		// TODO(gpl) Is this a sane approach?
+		return ctrl.Result{}, nil
+	}
+
 	if r.OnReconcile != nil {
 		r.OnReconcile(ctx, &workspace)
 	}
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 55301df5072b18..d6f05067aef694 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -180,7 +180,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 	if len(workspacePods.Items) == 0 {
 		// if there isn't a workspace pod and we're not currently deleting this workspace,// create one.
 		switch {
-		case workspace.Status.PodStarts == 0:
+		case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
 			sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
 			if err != nil {
 				log.Error(err, "unable to create startWorkspace context")
@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 				log.Error(err, "unable to create Pod for Workspace", "pod", pod)
 				return ctrl.Result{Requeue: true}, err
 			} else {
-				// TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
-				// need to be deleted and re-created
 				// Must increment and persist the pod starts, and ensure we retry on conflict.
 				// If we fail to persist this value, it's possible that the Pod gets recreated
 				// when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -221,6 +219,43 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 			r.Recorder.Event(workspace, corev1.EventTypeNormal, "Creating", "")
 		}
 
+		case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected):
+			if workspace.Status.PodRecreated > r.Config.PodRecreationMaxRetries {
+				workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
+				return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
+			}
+
+			// Must persist the modified pod counters, and ensure we retry on conflict.
+			// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
+			// when the workspace stops, due to PodRecreated not yet being incremented when the original Pod
+			// disappears.
+			// Use a Patch instead of an Update, to prevent conflicts.
+			patch := client.MergeFrom(workspace.DeepCopy())
+
+			// Reset status
+			sc := workspace.Status.DeepCopy()
+			workspace.Status = workspacev1.WorkspaceStatus{}
+			workspace.Status.OwnerToken = sc.OwnerToken
+			workspace.Status.PodStarts = sc.PodStarts
+			workspace.Status.PodRecreated = sc.PodRecreated + 1
+			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Recreating pod... (retry %d)", workspace.Status.PodRecreated), metav1.ConditionFalse))
+
+			if err := r.Status().Patch(ctx, workspace, patch); err != nil {
+				log.Error(err, "Failed to patch workspace status-reset")
+				return ctrl.Result{}, err
+			}
+
+			// Reset metrics cache
+			r.metrics.forgetWorkspace(workspace)
+
+			requeueAfter := 5 * time.Second
+			if r.Config.PodRecreationBackoff != 0 {
+				requeueAfter = time.Duration(r.Config.PodRecreationBackoff)
+			}
+
+			r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
+			return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil
+
 		case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
 			if err := r.deleteWorkspaceSecrets(ctx, workspace); err != nil {
 				return ctrl.Result{}, err
 			}
@@ -378,6 +413,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
 		lastState.recordedStartTime = true
 	}
 
+	if lastState.recordedRecreations < workspace.Status.PodRecreated {
+		r.metrics.countWorkspaceRecreations(&log, workspace)
+		lastState.recordedRecreations = workspace.Status.PodRecreated
+	}
+
 	if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
 		r.metrics.countWorkspaceStop(&log, workspace)
 
@@ -403,7 +443,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
 	isAborted := ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted)
 	// Also ignore workspaces that are requested to be stopped before they became ready.
 	isStoppedByRequest := ws.IsConditionTrue(workspacev1.WorkspaceConditionStoppedByRequest)
-	return !everReady && !isAborted && !isStoppedByRequest
+	// Also ignore pods that got rejected by the node
+	isPodRejected := ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected)
+	return !everReady && !isAborted && !isStoppedByRequest && !isPodRejected
 }
 
 func (r *WorkspaceReconciler) emitPhaseEvents(ctx context.Context, ws *workspacev1.Workspace, old *workspacev1.WorkspaceStatus) {
diff --git a/components/ws-manager-mk2/controllers/workspace_controller_test.go b/components/ws-manager-mk2/controllers/workspace_controller_test.go
index 5af06352e4ed0f..f21d01fe706dcd 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller_test.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller_test.go
@@ -389,6 +389,149 @@ var _ = Describe("WorkspaceController", func() {
 			})
 		})
 
+		It("pod rejection should result in a retry", func() {
+			ws := newWorkspace(uuid.NewString(), "default")
+			m := collectMetricCounts(wsMetrics, ws)
+
+			// ### prepare block start
+			By("creating workspace")
+			// Simulate pod getting scheduled to a node.
+			var node corev1.Node
+			node.Name = uuid.NewString()
+			Expect(k8sClient.Create(ctx, &node)).To(Succeed())
+			// Manually create the workspace pod with the node name.
+			// We can't update the pod with the node name, as this operation
+			// is only allowed for the scheduler. So as a hack, we manually
+			// create the workspace's pod.
+			pod := &corev1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:       fmt.Sprintf("ws-%s", ws.Name),
+					Namespace:  ws.Namespace,
+					Finalizers: []string{workspacev1.GitpodFinalizerName},
+					Labels: map[string]string{
+						wsk8s.WorkspaceManagedByLabel: constants.ManagedBy,
+					},
+				},
+				Spec: corev1.PodSpec{
+					NodeName: node.Name,
+					Containers: []corev1.Container{{
+						Name:  "workspace",
+						Image: "someimage",
+					}},
+				},
+			}
+
+			Expect(k8sClient.Create(ctx, pod)).To(Succeed())
+			pod = createWorkspaceExpectPod(ws)
+			updateObjWithRetries(k8sClient, pod, false, func(pod *corev1.Pod) {
+				Expect(ctrl.SetControllerReference(ws, pod, k8sClient.Scheme())).To(Succeed())
+			})
+			// mimic the regular "start" phase
+			updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
+				ws.Status.PodStarts = 1
+				ws.Status.PodRecreated = 0
+			})
+
+			// Wait until controller has reconciled at least once (by waiting for the runtime status to get updated).
+			// This is necessary for the metrics to get recorded correctly. If we don't wait, the first reconciliation
+			// might be once the Pod is already in a running state, and hence the metric state might not record e.g. content
+			// restore.
+			// This is only necessary because we manually created the pod, normally the Pod creation is the controller's
+			// first reconciliation which ensures the metrics are recorded from the workspace's initial state.
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
+				g.Expect(ws.Status.Runtime).ToNot(BeNil())
+				g.Expect(ws.Status.Runtime.PodName).To(Equal(pod.Name))
+			}, timeout, interval).Should(Succeed())
+
+			// Await "deployed" condition, and check we are good
+			expectConditionEventually(ws, string(workspacev1.WorkspaceConditionDeployed), metav1.ConditionTrue, "")
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
+				g.Expect(ws.Status.PodStarts).To(Equal(1))
+				g.Expect(ws.Status.PodRecreated).To(Equal(0))
+			}, timeout, interval).Should(Succeed())
+
+			// ### prepare block end
+
+			// ### trigger block start
+			// Make pod be rejected 🪄
+			By("rejecting pod")
+			rejectPod(pod)
+
+			// TODO(gpl): how to check for transient states like:
+			//  - pod deletion
+			//  - PodRejected condition
+			// By("await pod deleted")
+			// expectWorkspaceCleanup(ws, pod)
+			// Eventually(func(g Gomega) {
+			// 	g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: pod.Name, Namespace: pod.Namespace}, pod)).To(MatchError(ContainSubstring("not found")))
+			// }, timeout, interval).Should(Succeed())
+
+			By("await pod recreation")
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
+				g.Expect(ws.Status.PodRecreated).To(Equal(1))
+				g.Expect(ws.Status.Phase).To(Equal(workspacev1.WorkspacePhasePending))
+			}, timeout, interval).Should(Succeed())
+			// ### trigger block end
+
+			// ### retry block start
+			// Transition Pod to pending, and expect workspace to reach Creating phase.
+			// This should also cause create time metrics to be recorded.
+			updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) {
+				pod.Status.Phase = corev1.PodPending
+				pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+					State: corev1.ContainerState{
+						Waiting: &corev1.ContainerStateWaiting{
+							Reason: "ContainerCreating",
+						},
+					},
+					Name: "workspace",
+				}}
+			})
+
+			expectPhaseEventually(ws, workspacev1.WorkspacePhaseCreating)
+			// ### retry block end
+
+			// ### move to running start
+			// Transition Pod to running, and expect workspace to reach Running phase.
+			// This should also cause e.g. startup time metrics to be recorded.
+			updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) {
+				pod.Status.Phase = corev1.PodRunning
+				pod.Status.ContainerStatuses = []corev1.ContainerStatus{{
+					Name:  "workspace",
+					Ready: true,
+				}}
+			})
+
+			updateObjWithRetries(k8sClient, ws, true, func(ws *workspacev1.Workspace) {
+				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionContentReady(metav1.ConditionTrue, workspacev1.ReasonInitializationSuccess, ""))
+			})
+
+			expectPhaseEventually(ws, workspacev1.WorkspacePhaseRunning)
+			// ### move to running end
+
+			// ### validate start
+			Eventually(func(g Gomega) {
+				g.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: ws.Name, Namespace: ws.Namespace}, ws)).To(Succeed())
+				g.Expect(ws.Status.PodStarts).To(Equal(2))
+				g.Expect(ws.Status.PodRecreated).To(Equal(1))
+			}, timeout, interval).Should(Succeed())
+
+			expectMetricsDelta(m, collectMetricCounts(wsMetrics, ws), metricCounts{
+				restores:       1,
+				backups:        0,
+				backupFailures: 0,
+				failures:       1,
+				creatingCounts: 1,
+				stops:          map[StopReason]int{StopReasonStartFailure: 1},
+				starts:         1, // this is NOT PodStarts, but merely an artifact of how we count it in the tests
+				recreations:    map[int]int{1: 1},
+			})
+			// ### validate end
+		})
 	})
 
 	Context("with headless workspaces", func() {
@@ -634,6 +777,16 @@ func requestStop(ws *workspacev1.Workspace) {
 	})
 }
 
+func rejectPod(pod *corev1.Pod) {
+	GinkgoHelper()
+	By("adding pod rejected condition")
+	updateObjWithRetries(k8sClient, pod, true, func(pod *corev1.Pod) {
+		pod.Status.Phase = corev1.PodFailed
+		pod.Status.Reason = "NodeAffinity"
+		pod.Status.Message = "Pod was rejected"
+	})
+}
+
 func markReady(ws *workspacev1.Workspace) {
 	GinkgoHelper()
 	By("adding content ready condition")
@@ -829,6 +982,7 @@ type metricCounts struct {
 	startFailures  int
 	failures       int
 	stops          map[StopReason]int
+	recreations    map[int]int
 	backups        int
 	backupFailures int
 	restores       int
@@ -855,12 +1009,17 @@ func collectMetricCounts(wsMetrics *controllerMetrics, ws *workspacev1.Workspace
 	for _, reason := range stopReasons {
 		stopCounts[reason] = int(testutil.ToFloat64(wsMetrics.totalStopsCounterVec.WithLabelValues(string(reason), tpe, cls)))
 	}
+	recreations := make(map[int]int)
+	for _, attempts := range []int{1, 2, 3, 4, 5} {
+		recreations[attempts] = int(testutil.ToFloat64(wsMetrics.totalRecreationsCounterVec.WithLabelValues(tpe, cls, fmt.Sprint(attempts))))
+	}
 	return metricCounts{
 		starts:         int(collectHistCount(startHist)),
 		creatingCounts: int(collectHistCount(creatingHist)),
 		startFailures:  int(testutil.ToFloat64(wsMetrics.totalStartsFailureCounterVec.WithLabelValues(tpe, cls))),
 		failures:       int(testutil.ToFloat64(wsMetrics.totalFailuresCounterVec.WithLabelValues(tpe, cls))),
 		stops:          stopCounts,
+		recreations:    recreations,
 		backups:        int(testutil.ToFloat64(wsMetrics.totalBackupCounterVec.WithLabelValues(tpe, cls))),
 		backupFailures: int(testutil.ToFloat64(wsMetrics.totalBackupFailureCounterVec.WithLabelValues(tpe, cls))),
 		restores:       int(testutil.ToFloat64(wsMetrics.totalRestoreCounterVec.WithLabelValues(tpe, cls))),
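
Reviewer note: the retry trigger added in status.go reduces to a single predicate over the pod status. A minimal standalone sketch of that check follows; the helper name isPodRejected and the package wrapper are illustrative and not part of this patch.

package example // illustrative only; the patch inlines this check in updateWorkspaceStatus

import (
	"strings"

	corev1 "k8s.io/api/core/v1"
)

// isPodRejected reports whether a workspace pod failed because the node
// rejected it after scheduling (reason NodeAffinity or OutOfCPU, message
// starting with "Pod was rejected") - the situation this patch recovers
// from by re-creating the pod.
func isPodRejected(pod *corev1.Pod) bool {
	return pod.Status.Phase == corev1.PodFailed &&
		(pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") &&
		strings.HasPrefix(pod.Status.Message, "Pod was rejected")
}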
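
The re-creation path in actOnStatus is bounded by the two new config fields. A small sketch of the decision it implements, under the same assumptions as the patch (PodRecreationMaxRetries as the retry budget, PodRecreationBackoff with a 5 second fallback); the helper nextRecreation is illustrative only.

package example // illustrative only; the controller inlines this logic in actOnStatus

import "time"

// nextRecreation decides whether a rejected workspace pod should be re-created
// and how long to wait before doing so. podRecreated is the number of re-creations
// so far, maxRetries corresponds to PodRecreationMaxRetries, backoff to PodRecreationBackoff.
func nextRecreation(podRecreated, maxRetries int, backoff time.Duration) (recreate bool, wait time.Duration) {
	if podRecreated > maxRetries {
		// budget exhausted: the controller flips the PodRejected condition to False
		// and lets the workspace fail instead of re-creating the pod again
		return false, 0
	}
	if backoff == 0 {
		backoff = 5 * time.Second // default the controller uses when no backoff is configured
	}
	return true, backoff
}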