[ws-manager] Re-create workspace pods (incl. test)
geropl committed Sep 25, 2024
1 parent abb191f commit c895e84
Showing 8 changed files with 262 additions and 7 deletions.
5 changes: 5 additions & 0 deletions components/ws-manager-api/go/config/config.go
@@ -142,6 +142,11 @@ type Configuration struct {

// SSHGatewayCAPublicKey is a CA public key
SSHGatewayCAPublicKey string

// PodRecreationMaxRetries is the maximum number of times ws-manager re-creates a workspace pod after it was rejected by the node
PodRecreationMaxRetries int `json:"podRecreationMaxRetries,omitempty"`
// PodRecreationBackoff is how long ws-manager waits between pod re-creation attempts
PodRecreationBackoff util.Duration `json:"podRecreationBackoff,omitempty"`
}

type WorkspaceClass struct {
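For orientation only (not part of this commit): a minimal sketch of how the two new knobs might be set when constructing the ws-manager configuration. The concrete values and the util.Duration conversion are illustrative assumptions, not defaults introduced here.

	// Sketch only: values are illustrative; util.Duration is assumed to wrap time.Duration.
	func exampleConfig() config.Configuration {
		return config.Configuration{
			PodRecreationMaxRetries: 3,                               // give up after three re-creations
			PodRecreationBackoff:    util.Duration(15 * time.Second), // wait 15s between attempts
		}
	}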
19 changes: 16 additions & 3 deletions components/ws-manager-api/go/crd/v1/workspace_types.go
@@ -170,9 +170,10 @@ func (ps PortSpec) Equal(other PortSpec) bool {

// WorkspaceStatus defines the observed state of Workspace
type WorkspaceStatus struct {
PodStarts int `json:"podStarts"`
URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
PodStarts int `json:"podStarts"`
PodRecreated int `json:"podRecreated"`
URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`

// +kubebuilder:default=Unknown
Phase WorkspacePhase `json:"phase,omitempty"`
@@ -263,6 +264,9 @@ const (
// WorkspaceContainerRunning is true if the workspace container is running.
// Used to determine if a backup can be taken, only once the container is stopped.
WorkspaceConditionContainerRunning WorkspaceCondition = "WorkspaceContainerRunning"

// WorkspaceConditionPodRejected is true if we detected that the pod was rejected by the node
WorkspaceConditionPodRejected WorkspaceCondition = "PodRejected"
)

func NewWorkspaceConditionDeployed() metav1.Condition {
Expand Down Expand Up @@ -291,6 +295,15 @@ func NewWorkspaceConditionFailed(message string) metav1.Condition {
}
}

func NewWorkspaceConditionPodRejected(message string, status metav1.ConditionStatus) metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionPodRejected),
LastTransitionTime: metav1.Now(),
Status: status,
Message: message,
}
}

func NewWorkspaceConditionTimeout(message string) metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionTimeout),
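For readability, a short sketch of how the new condition is meant to be driven; SetCondition and IsConditionTrue are used on WorkspaceStatus elsewhere in this diff, and the message string here is illustrative:

	// Mark the workspace when the node turns its pod away ...
	ws.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected("Pod was rejected by the node", metav1.ConditionTrue))

	// ... and let the reconciler pick it up later to decide whether to re-create the pod.
	if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
		// see actOnStatus in workspace_controller.go below
	}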
@@ -543,6 +543,8 @@ spec:
type: string
podStarts:
type: integer
podRecreated:
type: integer
runtime:
properties:
hostIP:
18 changes: 18 additions & 0 deletions components/ws-manager-mk2/controllers/metrics.go
@@ -30,6 +30,7 @@ const (
workspaceStartFailuresTotal string = "workspace_starts_failure_total"
workspaceFailuresTotal string = "workspace_failure_total"
workspaceStopsTotal string = "workspace_stops_total"
workspaceRecreationsTotal string = "workspace_recreations_total"
workspaceBackupsTotal string = "workspace_backups_total"
workspaceBackupFailuresTotal string = "workspace_backups_failure_total"
workspaceRestoresTotal string = "workspace_restores_total"
@@ -57,6 +58,7 @@ type controllerMetrics struct {
totalStartsFailureCounterVec *prometheus.CounterVec
totalFailuresCounterVec *prometheus.CounterVec
totalStopsCounterVec *prometheus.CounterVec
totalRecreationsCounterVec *prometheus.CounterVec

totalBackupCounterVec *prometheus.CounterVec
totalBackupFailureCounterVec *prometheus.CounterVec
@@ -120,6 +122,12 @@ func newControllerMetrics(r *WorkspaceReconciler) (*controllerMetrics, error) {
Name: workspaceStopsTotal,
Help: "total number of workspaces stopped",
}, []string{"reason", "type", "class"}),
totalRecreationsCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
Subsystem: metricsWorkspaceSubsystem,
Name: workspaceRecreationsTotal,
Help: "total number of workspace recreations",
}, []string{"type", "class", "attempt"}),

totalBackupCounterVec: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: metricsNamespace,
@@ -233,6 +241,14 @@ func (m *controllerMetrics) countWorkspaceStop(log *logr.Logger, ws *workspacev1
m.totalStopsCounterVec.WithLabelValues(reason, tpe, class).Inc()
}

func (m *controllerMetrics) countWorkspaceRecreations(log *logr.Logger, ws *workspacev1.Workspace) {
class := ws.Spec.Class
tpe := string(ws.Spec.Type)
attempt := fmt.Sprint(ws.Status.PodRecreated)

m.totalRecreationsCounterVec.WithLabelValues(tpe, class, attempt).Inc()
}

func (m *controllerMetrics) countTotalBackups(log *logr.Logger, ws *workspacev1.Workspace) {
class := ws.Spec.Class
tpe := string(ws.Spec.Type)
@@ -291,6 +307,7 @@ type metricState struct {
recordedContentReady bool
recordedBackupFailed bool
recordedBackupCompleted bool
recordedRecreations int
}

func newMetricState(ws *workspacev1.Workspace) metricState {
@@ -306,6 +323,7 @@ func newMetricState(ws *workspacev1.Workspace) metricState {
recordedContentReady: ws.IsConditionTrue(workspacev1.WorkspaceConditionContentReady),
recordedBackupFailed: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupFailure),
recordedBackupCompleted: ws.IsConditionTrue(workspacev1.WorkspaceConditionBackupComplete),
recordedRecreations: ws.Status.PodRecreated,
}
}

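To make the new label set concrete, here is a self-contained sketch of such a counter together with a testutil read-back; the namespace/subsystem strings and label values are placeholders, not taken from this commit:

	import (
		"fmt"

		"github.com/prometheus/client_golang/prometheus"
		"github.com/prometheus/client_golang/prometheus/testutil"
	)

	func exampleRecreationCounter() {
		recreations := prometheus.NewCounterVec(prometheus.CounterOpts{
			Namespace: "gitpod",         // placeholder
			Subsystem: "ws_manager_mk2", // placeholder
			Name:      "workspace_recreations_total",
			Help:      "total number of workspace recreations",
		}, []string{"type", "class", "attempt"})

		// One increment per re-creation, with the attempt number as a label value.
		recreations.WithLabelValues("Regular", "default", "1").Inc()

		fmt.Println(testutil.ToFloat64(recreations.WithLabelValues("Regular", "default", "1"))) // 1
	}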
10 changes: 10 additions & 0 deletions components/ws-manager-mk2/controllers/status.go
@@ -123,6 +123,16 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
workspace.Status.Phase = *phase
}

if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
// Check: is this a situation where we want to retry?
if pod.Status.Phase == corev1.PodFailed && (pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") && strings.HasPrefix(pod.Status.Message, "Pod was rejected") {
// This is a situation where we want to re-create the pod!
log.Info("workspace scheduling failed", "workspace", workspace.Name, "reason", failure)
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(failure, metav1.ConditionTrue))
r.Recorder.Event(workspace, corev1.EventTypeWarning, "PodRejected", failure)
}
}

if failure != "" && !workspace.IsConditionTrue(workspacev1.WorkspaceConditionFailed) {
// workspaces can fail only once - once there is a failed condition set, stick with it
log.Info("workspace failed", "workspace", workspace.Name, "reason", failure)
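The rejection check above boils down to a small predicate; here is the same logic factored out as a sketch (the helper name is hypothetical, the reason/message matching mirrors the diff):

	// isRejectedByNode reports whether the kubelet refused the pod after scheduling,
	// e.g. because node affinity no longer matched or the node ran out of CPU.
	func isRejectedByNode(pod *corev1.Pod) bool {
		return pod.Status.Phase == corev1.PodFailed &&
			(pod.Status.Reason == "NodeAffinity" || pod.Status.Reason == "OutOfCPU") &&
			strings.HasPrefix(pod.Status.Message, "Pod was rejected")
	}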
@@ -61,6 +61,12 @@ func (r *SubscriberReconciler) Reconcile(ctx context.Context, req ctrl.Request)
workspace.Status.Conditions = []metav1.Condition{}
}

if workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
// In this situation, we are about to re-create the pod. We don't want clients to see all the "stopping, stopped, starting" chatter, so we hide it here.
// TODO(gpl) Is this a sane approach?
return ctrl.Result{}, nil
}

if r.OnReconcile != nil {
r.OnReconcile(ctx, &workspace)
}
50 changes: 46 additions & 4 deletions components/ws-manager-mk2/controllers/workspace_controller.go
@@ -180,7 +180,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
if len(workspacePods.Items) == 0 {
// if there isn't a workspace pod and we're not currently deleting this workspace, create one.
switch {
case workspace.Status.PodStarts == 0:
case workspace.Status.PodStarts == 0 || workspace.Status.PodStarts-workspace.Status.PodRecreated < 1:
sctx, err := newStartWorkspaceContext(ctx, r.Config, workspace)
if err != nil {
log.Error(err, "unable to create startWorkspace context")
@@ -204,8 +204,6 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
log.Error(err, "unable to create Pod for Workspace", "pod", pod)
return ctrl.Result{Requeue: true}, err
} else {
// TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
// need to be deleted and re-created
// Must increment and persist the pod starts, and ensure we retry on conflict.
// If we fail to persist this value, it's possible that the Pod gets recreated
// when the workspace stops, due to PodStarts still being 0 when the original Pod
@@ -221,6 +219,43 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
r.Recorder.Event(workspace, corev1.EventTypeNormal, "Creating", "")
}

case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped && workspace.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected):
if workspace.Status.PodRecreated > r.Config.PodRecreationMaxRetries {
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
}

// Must persist this status modification, and ensure we retry on conflict.
// If we fail to persist this value, it's possible that the Pod gets re-created endlessly
// when the workspace stops, because PodRecreated would still hold its old value when the
// original Pod disappears.
// Use a Patch instead of an Update, to prevent conflicts.
patch := client.MergeFrom(workspace.DeepCopy())

// Reset status
sc := workspace.Status.DeepCopy()
workspace.Status = workspacev1.WorkspaceStatus{}
workspace.Status.OwnerToken = sc.OwnerToken
workspace.Status.PodStarts = sc.PodStarts
workspace.Status.PodRecreated = sc.PodRecreated + 1
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Recreating pod... (%d retry)", workspace.Status.PodRecreated), metav1.ConditionFalse))

if err := r.Status().Patch(ctx, workspace, patch); err != nil {
log.Error(err, "Failed to patch workspace status-reset")
return ctrl.Result{}, err
}

// Reset metrics cache
r.metrics.forgetWorkspace(workspace)

requeueAfter := 5 * time.Second
if r.Config.PodRecreationBackoff != 0 {
requeueAfter = time.Duration(r.Config.PodRecreationBackoff)
}

r.Recorder.Event(workspace, corev1.EventTypeNormal, "Recreating", "")
return ctrl.Result{Requeue: true, RequeueAfter: requeueAfter}, nil

case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
if err := r.deleteWorkspaceSecrets(ctx, workspace); err != nil {
return ctrl.Result{}, err
@@ -378,6 +413,11 @@ func (r *WorkspaceReconciler) updateMetrics(ctx context.Context, workspace *work
lastState.recordedStartTime = true
}

if lastState.recordedRecreations < workspace.Status.PodRecreated {
r.metrics.countWorkspaceRecreations(&log, workspace)
lastState.recordedRecreations = workspace.Status.PodRecreated
}

if workspace.Status.Phase == workspacev1.WorkspacePhaseStopped {
r.metrics.countWorkspaceStop(&log, workspace)

@@ -403,7 +443,9 @@ func isStartFailure(ws *workspacev1.Workspace) bool {
isAborted := ws.IsConditionTrue(workspacev1.WorkspaceConditionAborted)
// Also ignore workspaces that are requested to be stopped before they became ready.
isStoppedByRequest := ws.IsConditionTrue(workspacev1.WorkspaceConditionStoppedByRequest)
return !everReady && !isAborted && !isStoppedByRequest
// Also ignore pods that got rejected by the node
isPodRejected := ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected)
return !everReady && !isAborted && !isStoppedByRequest && !isPodRejected
}

func (r *WorkspaceReconciler) emitPhaseEvents(ctx context.Context, ws *workspacev1.Workspace, old *workspacev1.WorkspaceStatus) {
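One detail worth noting from the re-creation case above: the requeue delay falls back to five seconds when podRecreationBackoff is not configured. The same decision as a sketch (the helper name is hypothetical):

	// recreationBackoff returns how long the reconciler waits before re-creating a rejected pod.
	func recreationBackoff(cfg config.Configuration) time.Duration {
		if cfg.PodRecreationBackoff != 0 {
			return time.Duration(cfg.PodRecreationBackoff)
		}
		return 5 * time.Second // default used when no backoff is configured
	}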