From d3b0ed0c66536fca3f02dd7d3d240e9b867990ca Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Thu, 24 Oct 2024 15:39:32 +0000
Subject: [PATCH] [ws-daemon] Make sure to clean up all workspace state for rejected pods

---
 .../pkg/controller/workspace_controller.go | 19 +++++++++++
 .../pkg/controller/workspace_operations.go | 32 +++++++++++++++++++
 .../controllers/workspace_controller.go    |  1 +
 3 files changed, 52 insertions(+)

diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
index ded5fffa329ac4..ea92cb8cedef11 100644
--- a/components/ws-daemon/pkg/controller/workspace_controller.go
+++ b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -227,6 +227,25 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
 	span, ctx := opentracing.StartSpanFromContext(ctx, "handleWorkspaceStop")
 	defer tracing.FinishSpan(span, &err)
 
+	if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
+		// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
+		log.Info("handling workspace stop - wiping mode")
+		err = wsc.operations.WipeWorkspace(ctx, ws.Name)
+		if err != nil {
+			wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
+			return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
+		}
+
+		return ctrl.Result{}, nil
+	}
+
+	// regular case
+	return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
+}
+
+func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, span opentracing.Span, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
+	log := log.FromContext(ctx)
+
 	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c == nil || c.Status == metav1.ConditionFalse {
 		return ctrl.Result{}, fmt.Errorf("workspace content was never ready")
 	}
diff --git a/components/ws-daemon/pkg/controller/workspace_operations.go b/components/ws-daemon/pkg/controller/workspace_operations.go
index f55d57a5833197..4a014f98a253bc 100644
--- a/components/ws-daemon/pkg/controller/workspace_operations.go
+++ b/components/ws-daemon/pkg/controller/workspace_operations.go
@@ -66,6 +66,8 @@ type WorkspaceOperations interface {
 	BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error)
 	// DeleteWorkspace deletes the content of the workspace from disk
 	DeleteWorkspace(ctx context.Context, instanceID string) error
+	// WipeWorkspace deletes all references to the workspace. Does not fail if parts are already gone or state is inconsistent.
+	WipeWorkspace(ctx context.Context, instanceID string) error
 	// SnapshotIDs generates the name and url for a snapshot
 	SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error)
 	// Snapshot takes a snapshot of the workspace
@@ -285,6 +287,36 @@ func (wso *DefaultWorkspaceOperations) DeleteWorkspace(ctx context.Context, inst
 	return nil
 }
 
+func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instanceID string) error {
+	ws, err := wso.provider.GetAndConnect(ctx, instanceID)
+	if err != nil {
+		// we have to assume everything is fine, and this workspace has already been completely wiped
+		return nil
+	}
+
+	if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot dispose session")
+		return err
+	}
+
+	// remove the workspace daemon directory on the node
+	if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")
+		return err
+	}
+
+	// remove the workspace daemon node directory on the node
+	// TODO(gpl): Is this used at all? Can't find any reference
+	if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
+		return err
+	}
+
+	wso.provider.Remove(ctx, instanceID)
+
+	return nil
+}
+
 func (wso *DefaultWorkspaceOperations) SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error) {
 	sess, err := wso.provider.GetAndConnect(ctx, instanceID)
 	if err != nil {
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 28330450245f2b..ddf5b08b937493 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -224,6 +224,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
 		return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
 	}
+	log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
 
 	// Must persist the modification pod starts, and ensure we retry on conflict.
 	// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
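
Reviewer note: the sketch below uses hypothetical, heavily simplified types (workspace, operations, loggingOps) and is not the actual ws-daemon code; it only illustrates the control flow this patch adds to handleWorkspaceStop, where a workspace whose pod was rejected skips the backup path entirely and is wiped instead, while all other workspaces keep taking the regular content-backup path.

package main

import (
	"context"
	"fmt"
)

// workspace and operations are hypothetical stand-ins for the real ws-daemon
// types; they exist only to show the branching.
type workspace struct {
	Name        string
	PodRejected bool // stands in for the WorkspaceConditionPodRejected condition
}

type operations interface {
	WipeWorkspace(ctx context.Context, instanceID string) error
	BackupWorkspace(ctx context.Context, instanceID string) error
}

// handleStop mirrors the flow this patch adds to handleWorkspaceStop: a
// rejected pod is wiped without taking a backup, everything else follows the
// regular content-backup path.
func handleStop(ctx context.Context, ops operations, ws *workspace) error {
	if ws.PodRejected {
		if err := ops.WipeWorkspace(ctx, ws.Name); err != nil {
			return fmt.Errorf("failed to wipe workspace: %w", err)
		}
		return nil
	}
	return ops.BackupWorkspace(ctx, ws.Name)
}

// loggingOps is a trivial fake used only to make the example runnable.
type loggingOps struct{}

func (loggingOps) WipeWorkspace(_ context.Context, id string) error {
	fmt.Println("wiping", id)
	return nil
}

func (loggingOps) BackupWorkspace(_ context.Context, id string) error {
	fmt.Println("backing up", id)
	return nil
}

func main() {
	ctx := context.Background()
	_ = handleStop(ctx, loggingOps{}, &workspace{Name: "rejected-ws", PodRejected: true})
	_ = handleStop(ctx, loggingOps{}, &workspace{Name: "healthy-ws"})
}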
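A second sketch, under the same caveat (sessionStore and its methods are made up for illustration and are not the real WorkspaceProvider API), shows the best-effort semantics the new WipeWorkspace documents: a session that cannot be found is treated as already wiped, and removal of on-disk state is idempotent because os.RemoveAll returns nil for paths that do not exist.

package main

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// sessionStore is a hypothetical stand-in for ws-daemon's workspace provider.
type sessionStore struct {
	sessions map[string]string // instanceID -> on-disk location of workspace state
}

var errNotFound = errors.New("session not found")

func (s *sessionStore) get(instanceID string) (string, error) {
	loc, ok := s.sessions[instanceID]
	if !ok {
		return "", errNotFound
	}
	return loc, nil
}

// wipe removes every trace of a workspace instance and deliberately succeeds
// even when the session or its directories are already gone.
func (s *sessionStore) wipe(instanceID string) error {
	loc, err := s.get(instanceID)
	if err != nil {
		// nothing tracked for this instance: assume it has already been wiped
		return nil
	}
	// os.RemoveAll returns nil for non-existent paths, so this step is idempotent
	if err := os.RemoveAll(loc); err != nil {
		return fmt.Errorf("cannot delete workspace directory: %w", err)
	}
	delete(s.sessions, instanceID)
	return nil
}

func main() {
	dir, err := os.MkdirTemp("", "ws-wipe-demo-")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	loc := filepath.Join(dir, "inst-1")
	if err := os.MkdirAll(loc, 0o755); err != nil {
		panic(err)
	}

	store := &sessionStore{sessions: map[string]string{"inst-1": loc}}
	fmt.Println(store.wipe("inst-1")) // <nil>: directory removed, session forgotten
	fmt.Println(store.wipe("inst-1")) // <nil>: already wiped, still no error
}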