Skip to content

Commit

Permalink
[ws-daemon] Make sure to cleanup all workspace state for rejected pods
Browse files Browse the repository at this point in the history
  • Loading branch information
geropl committed Oct 25, 2024
1 parent ac1c86b commit d3b0ed0
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 0 deletions.
19 changes: 19 additions & 0 deletions components/ws-daemon/pkg/controller/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,25 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
span, ctx := opentracing.StartSpanFromContext(ctx, "handleWorkspaceStop")
defer tracing.FinishSpan(span, &err)

if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
log.Info("handling workspace stop - wiping mode")
err = wsc.operations.WipeWorkspace(ctx, ws.Name)
if err != nil {
wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
}

return ctrl.Result{}, nil
}

// regular case
return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
}

func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, span opentracing.Span, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
log := log.FromContext(ctx)

if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c == nil || c.Status == metav1.ConditionFalse {
return ctrl.Result{}, fmt.Errorf("workspace content was never ready")
}
Expand Down
32 changes: 32 additions & 0 deletions components/ws-daemon/pkg/controller/workspace_operations.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ type WorkspaceOperations interface {
BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error)
// DeleteWorkspace deletes the content of the workspace from disk
DeleteWorkspace(ctx context.Context, instanceID string) error
// WipeWorkspace deletes all references to the workspace. Does not fail if parts are already gone, or state is inconsistent.
WipeWorkspace(ctx context.Context, instanceID string) error
// SnapshotIDs generates the name and url for a snapshot
SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error)
// Snapshot takes a snapshot of the workspace
Expand Down Expand Up @@ -285,6 +287,36 @@ func (wso *DefaultWorkspaceOperations) DeleteWorkspace(ctx context.Context, inst
return nil
}

// WipeWorkspace removes the state held for the workspace instance identified
// by instanceID. In order, it:
//
//  1. looks up the session via the provider's GetAndConnect,
//  2. disposes the session, passing the provider's WorkspaceDisposed hooks,
//  3. removes ws.ServiceLocDaemon and ws.ServiceLocNode with os.RemoveAll,
//  4. removes the instance from the provider.
//
// If the session cannot be obtained, the workspace is assumed to be wiped
// already and nil is returned. If disposing the session or removing one of the
// directories fails, that error is logged and returned immediately; the
// remaining steps (including the removal from the provider) are not executed.
func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instanceID string) error {
	ws, err := wso.provider.GetAndConnect(ctx, instanceID)
	if err != nil {
		// We have to assume everything is fine, and this workspace has already
		// been completely wiped. The error is still recorded (at debug level, to
		// avoid noise in the expected case) so that a different kind of lookup
		// failure does not vanish without a trace.
		glog.WithError(err).WithField("instanceID", instanceID).Debug("cannot get workspace session, assuming workspace is already wiped")
		return nil
	}

	if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {
		glog.WithError(err).WithFields(ws.OWI()).Error("cannot dispose session")
		return err
	}

	// remove workspace daemon directory in the node
	if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {
		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")
		return err
	}

	// remove workspace daemon node directory in the node
	// TODO(gpl): Is this used at all? Can't find any reference
	if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
		return err
	}

	// Drop the provider's reference last, only after all of the above succeeded.
	wso.provider.Remove(ctx, instanceID)

	return nil
}

func (wso *DefaultWorkspaceOperations) SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error) {
sess, err := wso.provider.GetAndConnect(ctx, instanceID)
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
}
log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")

// Must persist the modification to pod starts, and ensure we retry on conflict.
// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
Expand Down

0 comments on commit d3b0ed0

Please sign in to comment.