From d3b0ed0c66536fca3f02dd7d3d240e9b867990ca Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Thu, 24 Oct 2024 15:39:32 +0000
Subject: [PATCH] [ws-daemon] Make sure to clean up all workspace state for rejected pods

---
 .../pkg/controller/workspace_controller.go | 19 +++++++++++
 .../pkg/controller/workspace_operations.go | 32 +++++++++++++++++++
 .../controllers/workspace_controller.go    |  1 +
 3 files changed, 52 insertions(+)

diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
index ded5fffa329ac4..ea92cb8cedef11 100644
--- a/components/ws-daemon/pkg/controller/workspace_controller.go
+++ b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -227,6 +227,25 @@ func (wsc *WorkspaceController) handleWorkspaceStop(ctx context.Context, ws *wor
 	span, ctx := opentracing.StartSpanFromContext(ctx, "handleWorkspaceStop")
 	defer tracing.FinishSpan(span, &err)
 
+	if ws.IsConditionTrue(workspacev1.WorkspaceConditionPodRejected) {
+		// in this case we are not interested in any backups, but instead are concerned with completely wiping all state that might be dangling somewhere
+		log.Info("handling workspace stop - wiping mode")
+		err = wsc.operations.WipeWorkspace(ctx, ws.Name)
+		if err != nil {
+			wsc.emitEvent(ws, "Wiping", fmt.Errorf("failed to wipe workspace: %w", err))
+			return ctrl.Result{}, fmt.Errorf("failed to wipe workspace: %w", err)
+		}
+
+		return ctrl.Result{}, nil
+	}
+
+	// regular case
+	return wsc.doWorkspaceContentBackup(ctx, span, ws, req)
+}
+
+func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, span opentracing.Span, ws *workspacev1.Workspace, req ctrl.Request) (result ctrl.Result, err error) {
+	log := log.FromContext(ctx)
+
 	if c := wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionContentReady)); c == nil || c.Status == metav1.ConditionFalse {
 		return ctrl.Result{}, fmt.Errorf("workspace content was never ready")
 	}
diff --git a/components/ws-daemon/pkg/controller/workspace_operations.go b/components/ws-daemon/pkg/controller/workspace_operations.go
index f55d57a5833197..4a014f98a253bc 100644
--- a/components/ws-daemon/pkg/controller/workspace_operations.go
+++ b/components/ws-daemon/pkg/controller/workspace_operations.go
@@ -66,6 +66,8 @@ type WorkspaceOperations interface {
 	BackupWorkspace(ctx context.Context, opts BackupOptions) (*csapi.GitStatus, error)
 	// DeleteWorkspace deletes the content of the workspace from disk
 	DeleteWorkspace(ctx context.Context, instanceID string) error
+	// WipeWorkspace deletes all references to the workspace. Does not fail if parts are already gone or state is inconsistent.
+	WipeWorkspace(ctx context.Context, instanceID string) error
 	// SnapshotIDs generates the name and url for a snapshot
 	SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error)
 	// Snapshot takes a snapshot of the workspace
@@ -285,6 +287,36 @@ func (wso *DefaultWorkspaceOperations) DeleteWorkspace(ctx context.Context, inst
 	return nil
 }
 
+func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instanceID string) error {
+	ws, err := wso.provider.GetAndConnect(ctx, instanceID)
+	if err != nil {
+		// we have to assume everything is fine, and this workspace has already been completely wiped
+		return nil
+	}
+
+	if err = ws.Dispose(ctx, wso.provider.hooks[session.WorkspaceDisposed]); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot dispose session")
+		return err
+	}
+
+	// remove the workspace daemon directory on the node
+	if err := os.RemoveAll(ws.ServiceLocDaemon); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon directory")
+		return err
+	}
+
+	// remove the workspace daemon node directory on the node
+	// TODO(gpl): Is this used at all? Can't find any reference
+	if err := os.RemoveAll(ws.ServiceLocNode); err != nil {
+		glog.WithError(err).WithFields(ws.OWI()).Error("cannot delete workspace daemon node directory")
+		return err
+	}
+
+	wso.provider.Remove(ctx, instanceID)
+
+	return nil
+}
+
 func (wso *DefaultWorkspaceOperations) SnapshotIDs(ctx context.Context, instanceID string) (snapshotUrl, snapshotName string, err error) {
 	sess, err := wso.provider.GetAndConnect(ctx, instanceID)
 	if err != nil {
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 28330450245f2b..ddf5b08b937493 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -224,6 +224,7 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 		workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
 		return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
 	}
+	log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
 
 	// Must persist the modification pod starts, and ensure we retry on conflict.
 	// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
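
Reviewer note: the sketch below uses hypothetical, heavily simplified types (workspace, operations, loggingOps) and is not the actual ws-daemon code; it only illustrates the control flow this patch adds to handleWorkspaceStop, where a workspace whose pod was rejected skips the backup path entirely and is wiped instead, while all other workspaces keep taking the regular content-backup path.

package main

import (
	"context"
	"fmt"
)

// workspace and operations are hypothetical stand-ins for the real ws-daemon
// types; they exist only to show the branching.
type workspace struct {
	Name        string
	PodRejected bool // stands in for the WorkspaceConditionPodRejected condition
}

type operations interface {
	WipeWorkspace(ctx context.Context, instanceID string) error
	BackupWorkspace(ctx context.Context, instanceID string) error
}

// handleStop mirrors the flow this patch adds to handleWorkspaceStop: a
// rejected pod is wiped without taking a backup, everything else follows the
// regular content-backup path.
func handleStop(ctx context.Context, ops operations, ws *workspace) error {
	if ws.PodRejected {
		if err := ops.WipeWorkspace(ctx, ws.Name); err != nil {
			return fmt.Errorf("failed to wipe workspace: %w", err)
		}
		return nil
	}
	return ops.BackupWorkspace(ctx, ws.Name)
}

// loggingOps is a trivial fake used only to make the example runnable.
type loggingOps struct{}

func (loggingOps) WipeWorkspace(_ context.Context, id string) error {
	fmt.Println("wiping", id)
	return nil
}

func (loggingOps) BackupWorkspace(_ context.Context, id string) error {
	fmt.Println("backing up", id)
	return nil
}

func main() {
	ctx := context.Background()
	_ = handleStop(ctx, loggingOps{}, &workspace{Name: "rejected-ws", PodRejected: true})
	_ = handleStop(ctx, loggingOps{}, &workspace{Name: "healthy-ws"})
}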
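A second sketch, under the same caveat (sessionStore and its methods are made up for illustration and are not the real WorkspaceProvider API), shows the best-effort semantics the new WipeWorkspace documents: a session that cannot be found is treated as already wiped, and removal of on-disk state is idempotent because os.RemoveAll returns nil for paths that do not exist.

package main

import (
	"errors"
	"fmt"
	"os"
	"path/filepath"
)

// sessionStore is a hypothetical stand-in for ws-daemon's workspace provider.
type sessionStore struct {
	sessions map[string]string // instanceID -> on-disk location of workspace state
}

var errNotFound = errors.New("session not found")

func (s *sessionStore) get(instanceID string) (string, error) {
	loc, ok := s.sessions[instanceID]
	if !ok {
		return "", errNotFound
	}
	return loc, nil
}

// wipe removes every trace of a workspace instance and deliberately succeeds
// even when the session or its directories are already gone.
func (s *sessionStore) wipe(instanceID string) error {
	loc, err := s.get(instanceID)
	if err != nil {
		// nothing tracked for this instance: assume it has already been wiped
		return nil
	}
	// os.RemoveAll returns nil for non-existent paths, so this step is idempotent
	if err := os.RemoveAll(loc); err != nil {
		return fmt.Errorf("cannot delete workspace directory: %w", err)
	}
	delete(s.sessions, instanceID)
	return nil
}

func main() {
	dir, err := os.MkdirTemp("", "ws-wipe-demo-")
	if err != nil {
		panic(err)
	}
	defer os.RemoveAll(dir)

	loc := filepath.Join(dir, "inst-1")
	if err := os.MkdirAll(loc, 0o755); err != nil {
		panic(err)
	}

	store := &sessionStore{sessions: map[string]string{"inst-1": loc}}
	fmt.Println(store.wipe("inst-1")) // <nil>: directory removed, session forgotten
	fmt.Println(store.wipe("inst-1")) // <nil>: already wiped, still no error
}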