From ba21ee7618edbd2fcd608d4b60ec4c6d4f0a1415 Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Tue, 12 Nov 2024 15:22:24 +0000
Subject: [PATCH] review comments + log cleanup

---
 components/ws-daemon/pkg/cgroup/cgroup.go        |  3 ++-
 components/ws-daemon/pkg/container/containerd.go |  4 ++--
 components/ws-daemon/pkg/content/hooks.go        |  1 +
 .../pkg/controller/workspace_controller.go       |  4 ++--
 .../pkg/controller/workspace_operations.go       |  8 +++-----
 components/ws-daemon/pkg/cpulimit/dispatch.go    |  2 +-
 components/ws-daemon/pkg/daemon/markunmount.go   |  3 ++-
 components/ws-daemon/pkg/diskguard/guard.go      |  1 +
 components/ws-daemon/pkg/dispatch/dispatch.go    | 16 +++++-----------
 components/ws-daemon/pkg/iws/iws.go              |  8 +++-----
 components/ws-daemon/pkg/netlimit/netlimit.go    |  5 +++--
 components/ws-daemon/pkg/nsinsider/nsinsider.go  |  1 +
 components/ws-manager-mk2/controllers/status.go  |  1 -
 .../controllers/workspace_controller.go          |  9 +++++----
 14 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/components/ws-daemon/pkg/cgroup/cgroup.go b/components/ws-daemon/pkg/cgroup/cgroup.go
index bce65bbd53448a..899c1064e3bcfd 100644
--- a/components/ws-daemon/pkg/cgroup/cgroup.go
+++ b/components/ws-daemon/pkg/cgroup/cgroup.go
@@ -6,6 +6,7 @@ package cgroup

 import (
 	"context"
+	"errors"

 	"github.com/gitpod-io/gitpod/common-go/cgroups"
 	"github.com/gitpod-io/gitpod/common-go/log"
@@ -81,7 +82,7 @@ func (host *PluginHost) WorkspaceAdded(ctx context.Context, ws *dispatch.Workspa

 	cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
 	if err != nil {
-		if err == context.Canceled {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return xerrors.Errorf("cannot get cgroup path for container %s: %w", ws.ContainerID, err)
diff --git a/components/ws-daemon/pkg/container/containerd.go b/components/ws-daemon/pkg/container/containerd.go
index 26e82912914e40..c5df53f6ef3888 100644
--- a/components/ws-daemon/pkg/container/containerd.go
+++ b/components/ws-daemon/pkg/container/containerd.go
@@ -431,8 +431,8 @@ func (s *Containerd) WaitForContainerStop(ctx context.Context, workspaceInstance

 func (s *Containerd) DisposeContainer(ctx context.Context, workspaceInstanceID string) {
 	log := log.WithContext(ctx)
-	log.Debug("CONTAINERD: DISPOSING CONTAINER")
-	defer log.Debug("CONTAINERD: DISPOSING CONTAINER DONE")
+	log.Debug("containerd: disposing container")
+	defer log.Debug("containerd: disposing container done")

 	s.cond.L.Lock()
 	defer s.cond.L.Unlock()
diff --git a/components/ws-daemon/pkg/content/hooks.go b/components/ws-daemon/pkg/content/hooks.go
index 22358539958eff..3662097c7d9567 100644
--- a/components/ws-daemon/pkg/content/hooks.go
+++ b/components/ws-daemon/pkg/content/hooks.go
@@ -176,6 +176,7 @@ func hookWipingTeardown() session.WorkspaceLivecycleHook {

 		if !ws.DoWipe {
 			// this is the "default" case for 99% of all workspaces
+			// TODO(gpl): We should probably make this the default for all workspaces - but not with this PR
 			return nil
 		}
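Aside (not part of the patch): the next hunk persists the StateWiped condition via client-go's retry.RetryOnConflict. A minimal sketch of that pattern, assuming a controller-runtime client; the helper name and wiring here are illustrative only:

```go
package sketch

import (
	"context"

	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateStatusWithRetry re-fetches the object and re-applies the mutation
// whenever the API server rejects the update with a conflict (stale
// resourceVersion), using client-go's default backoff.
func updateStatusWithRetry(ctx context.Context, c client.Client, key client.ObjectKey, obj client.Object, mutate func()) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Fetch the latest version first; retrying on a stale copy
		// would just hit the same conflict again.
		if err := c.Get(ctx, key, obj); err != nil {
			return err
		}
		mutate()
		return c.Status().Update(ctx, obj)
	})
}
```

This is why the closure in the hunk below starts with wsc.Get before setting the condition.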
diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
index ae6991d68c4daf..59fb50c1dcf986 100644
--- a/components/ws-daemon/pkg/controller/workspace_controller.go
+++ b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -255,13 +255,13 @@ func (wsc *WorkspaceController) doWipeWorkspace(ctx context.Context, ws *workspa
 		return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
 	}

-	setStateWipedCondition := func(s bool) {
+	setStateWipedCondition := func(success bool) {
 		err := retry.RetryOnConflict(retryParams, func() error {
 			if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
 				return err
 			}
-			if s {
+			if success {
 				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
 			} else {
 				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionFalse))
 			}
diff --git a/components/ws-daemon/pkg/controller/workspace_operations.go b/components/ws-daemon/pkg/controller/workspace_operations.go
index 0010f1d4551b66..f2287519553fc9 100644
--- a/components/ws-daemon/pkg/controller/workspace_operations.go
+++ b/components/ws-daemon/pkg/controller/workspace_operations.go
@@ -315,9 +315,6 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
 	wso.dispatch.DisposeWorkspace(ctx, instanceID)

 	// remove workspace daemon directory in the node
-	log.Debug("DELETING WORKSPACE DAEMON DIR")
-	defer log.Debug("DELETING WORKSPACE DAEMON DIR DONE")
-
 	removedChan := make(chan struct{}, 1)
 	go func() {
 		defer close(removedChan)
@@ -327,9 +324,10 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
 		}
 	}()

-	timeoutT := time.NewTicker(10 * time.Second)
+	timeout := time.NewTicker(10 * time.Second)
+	defer timeout.Stop()
 	select {
-	case <-timeoutT.C:
+	case <-timeout.C:
 	case <-removedChan:
 		log.Debug("successfully removed workspace daemon directory")
 	}
diff --git a/components/ws-daemon/pkg/cpulimit/dispatch.go b/components/ws-daemon/pkg/cpulimit/dispatch.go
index 44b456ad171166..8020d7b9bb5d78 100644
--- a/components/ws-daemon/pkg/cpulimit/dispatch.go
+++ b/components/ws-daemon/pkg/cpulimit/dispatch.go
@@ -181,7 +181,7 @@ func (d *DispatchListener) WorkspaceAdded(ctx context.Context, ws *dispatch.Work

 	cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return xerrors.Errorf("cannot start governer: %w", err)
diff --git a/components/ws-daemon/pkg/daemon/markunmount.go b/components/ws-daemon/pkg/daemon/markunmount.go
index 2e647f81e1cf1b..30fa029a5066c9 100644
--- a/components/ws-daemon/pkg/daemon/markunmount.go
+++ b/components/ws-daemon/pkg/daemon/markunmount.go
@@ -8,6 +8,7 @@ import (
 	"bufio"
 	"bytes"
 	"context"
+	"errors"
 	"io/ioutil"
 	"path/filepath"
 	"strings"
@@ -127,7 +128,7 @@ func (c *MarkUnmountFallback) WorkspaceUpdated(ctx context.Context, ws *dispatch
 	}

 	err := unmountMark(ws.InstanceID)
-	if err != nil && !dispatch.IsCancelled(ctx) {
+	if err != nil && !errors.Is(err, context.Canceled) {
 		log.WithFields(ws.OWI()).WithError(err).Error("cannot unmount mark mount from within ws-daemon")
 		c.activityCounter.WithLabelValues("false").Inc()
 	} else {
diff --git a/components/ws-daemon/pkg/diskguard/guard.go b/components/ws-daemon/pkg/diskguard/guard.go
index fe21543e8df365..92183c15644860 100644
--- a/components/ws-daemon/pkg/diskguard/guard.go
+++ b/components/ws-daemon/pkg/diskguard/guard.go
@@ -71,6 +71,7 @@ type Guard struct {
 // Start starts the disk guard
 func (g *Guard) Start() {
 	t := time.NewTicker(g.Interval)
+	defer t.Stop()
 	for {
 		bvail, err := getAvailableBytes(g.Path)
 		if err != nil {
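Aside (not part of the patch): several hunks above replace `err == context.Canceled` and the `dispatch.IsCancelled(ctx)` helper with `errors.Is(err, context.Canceled)`, and the next hunk deletes the helper itself. A minimal self-contained example of the difference:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	// Errors returned through call chains are usually wrapped with %w.
	wrapped := fmt.Errorf("cannot get cgroup path: %w", context.Canceled)

	fmt.Println(wrapped == context.Canceled)          // false: equality sees only the outer error
	fmt.Println(errors.Is(wrapped, context.Canceled)) // true: errors.Is walks the wrap chain
}
```

Checking the returned error rather than the context also avoids swallowing unrelated failures that merely happen to occur while the context is being cancelled.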
 	return ctx.Value(contextDispatchWaitGroup).(*sync.WaitGroup)
 }

-func IsCancelled(ctx context.Context) bool {
-	return context.Cause(ctx) != nil
-}
-
 // Start starts the dispatch
 func (d *Dispatch) Start() error {
 	ifac := informers.NewSharedInformerFactoryWithOptions(d.Kubernetes, podInformerResyncInterval, informers.WithNamespace(d.KubernetesNamespace))
@@ -195,8 +191,8 @@ func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
 	d.mu.Lock()
 	defer d.mu.Unlock()

-	log.WithField("instanceID", instanceID).Debug("WS DISPOSE")
-	defer log.WithField("instanceID", instanceID).Debug("WS DISPOSE DONE")
+	log.WithField("instanceID", instanceID).Debug("disposing workspace")
+	defer log.WithField("instanceID", instanceID).Debug("disposing workspace done")

 	// If we have that instanceID present, cancel it's context
 	state, present := d.ctxs[instanceID]
@@ -237,10 +233,9 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
 	}
 	disposedKey := disposedKey(workspaceInstanceID, newPod)
 	if _, alreadyDisposed := d.disposedCtxs[disposedKey]; alreadyDisposed {
-		log.WithField("disposedKey", disposedKey).Debug("DROPPING POD UPDATE FOR DISPOSED POD")
+		log.WithField("disposedKey", disposedKey).Debug("dropping pod update for disposed pod")
 		return
 	}
-	log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE: %s", workspaceInstanceID)

 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -337,7 +332,6 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
 			}
 		}()
 	}
-	log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE DONE: %s", workspaceInstanceID)
 }

 func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
@@ -345,7 +339,8 @@ func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
 	if !ok {
 		return
 	}
-	log.WithField("instanceID", instanceID).Debugf("POD DELETED: %s", instanceID)
+	log.WithField("instanceID", instanceID).Debug("pod deleted")
+	defer log.WithField("instanceID", instanceID).Debug("pod deleted done")

 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -361,5 +356,4 @@

 	delete(d.ctxs, instanceID)

-	log.WithField("instanceID", instanceID).Debugf("POD DELETED DONE: %s", instanceID)
 }
diff --git a/components/ws-daemon/pkg/iws/iws.go b/components/ws-daemon/pkg/iws/iws.go
index f444fbfed9c14c..a2916cb909cff6 100644
--- a/components/ws-daemon/pkg/iws/iws.go
+++ b/components/ws-daemon/pkg/iws/iws.go
@@ -961,9 +961,6 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
 	owi := wbs.Session.OWI()
 	log := log.WithFields(owi)

-	log.Debug("TEARDOWN")
-	defer log.Debug("TEARDOWN DONE")
-
 	var (
 		success = true
 		err     error
@@ -981,8 +978,9 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
 // WipingTeardown tears down every state we created using IWS
 func (wbs *InWorkspaceServiceServer) WipingTeardown(ctx context.Context, req *api.WipingTeardownRequest) (*api.WipingTeardownResponse, error) {
 	log := log.WithFields(wbs.Session.OWI())
-	log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN")
-	defer log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN DONE")
+	log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown")
+	defer log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown done")
+
 	if !req.DoWipe {
 		return &api.WipingTeardownResponse{Success: true}, nil
 	}
diff --git a/components/ws-daemon/pkg/netlimit/netlimit.go b/components/ws-daemon/pkg/netlimit/netlimit.go
index 3fc26ef79065d0..cfe29ca86b79c4 100644
--- a/components/ws-daemon/pkg/netlimit/netlimit.go
+++ b/components/ws-daemon/pkg/netlimit/netlimit.go
@@ -6,6 +6,7 @@ package netlimit

 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -130,7 +131,7 @@ func (c *ConnLimiter) limitWorkspace(ctx context.Context, ws *dispatch.Workspace

 	pid, err := disp.Runtime.ContainerPID(ctx, ws.ContainerID)
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return fmt.Errorf("could not get pid for container %s of workspace %s", ws.ContainerID, ws.WorkspaceID)
@@ -144,7 +145,7 @@ func (c *ConnLimiter) limitWorkspace(ctx context.Context, ws *dispatch.Workspace
 		}
 	}, nsinsider.EnterMountNS(false), nsinsider.EnterNetNS(true))
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		log.WithError(err).WithFields(ws.OWI()).Error("cannot enable connection limiting")
diff --git a/components/ws-daemon/pkg/nsinsider/nsinsider.go b/components/ws-daemon/pkg/nsinsider/nsinsider.go
index 0b7d3473be3bf9..53951490f938e8 100644
--- a/components/ws-daemon/pkg/nsinsider/nsinsider.go
+++ b/components/ws-daemon/pkg/nsinsider/nsinsider.go
@@ -110,6 +110,7 @@ func Nsinsider(instanceID string, targetPid int, mod func(*exec.Cmd), opts ...ns
 	_, _ = io.Copy(os.Stderr, &cmdErr)
 	log.FromBuffer(&cmdOut, log.WithFields(log.OWI("", "", instanceID)))
 	if err != nil {
+		// writing stderr to the error so clients can pattern match on specific errors
 		return fmt.Errorf("run nsinsider (%v) failed: %q \\ %q\n%v",
 			cmd.Args,
 			cmdOut.String(),
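Aside (not part of the patch): the comment added in the nsinsider hunk above notes that stderr is folded into the returned error so callers can match on specific failure messages. The general shape of that technique, with a hypothetical helper:

```go
package main

import (
	"bytes"
	"fmt"
	"os/exec"
)

// runTool is a hypothetical helper illustrating the pattern: capture the
// command's output streams and include them in the returned error.
func runTool(args ...string) error {
	var stdout, stderr bytes.Buffer
	cmd := exec.Command("nsinsider", args...)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Callers can now pattern match on the captured output instead
		// of seeing only an opaque exit code.
		return fmt.Errorf("run nsinsider (%v) failed: %q \\ %q\n%v",
			cmd.Args, stdout.String(), stderr.String(), err)
	}
	return nil
}

func main() {
	if err := runTool("--help"); err != nil {
		fmt.Println(err)
	}
}
```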
 		// This is used for the delaying eventual pod restarts
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 68c00f8f5ce67d..791912f00159a5 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -224,22 +224,23 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
 			return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
 		}
-		log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
+		log = log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase)

 		// Make sure to wait for "recreationTimeout" before creating the pod again
 		if workspace.Status.PodDeletionTime == nil {
-			log.Info("want to wait for pod recreation timeout, but podDeletionTime not set (yet)")
+			log.Info("pod recreation: waiting for pod deletion time to be populated...")
 			return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil
 		}

 		recreationTimeout := r.podRecreationTimeout()
 		podDeletionTime := workspace.Status.PodDeletionTime.Time
 		waitTime := time.Until(podDeletionTime.Add(recreationTimeout))
+		log = log.WithValues("waitTime", waitTime.String(), "recreationTimeout", recreationTimeout.String(), "podDeletionTime", podDeletionTime.String())
 		if waitTime > 0 {
-			log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
+			log.Info("pod recreation: waiting for timeout...")
 			return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
 		}
-		log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
+		log.Info("trigger pod recreation")

 		// Must persist the modification pod starts, and ensure we retry on conflict.
 		// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
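Aside (not part of the patch): the final hunk waits out the recreation timeout by returning a ctrl.Result with RequeueAfter, which asks controller-runtime to run Reconcile again later instead of blocking in the reconciler. A minimal sketch of that pattern; the function and its arguments are illustrative:

```go
package sketch

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// waitOrProceed reports whether the recreation timeout has elapsed since
// the pod was deleted, and if not, how to requeue the reconciliation.
func waitOrProceed(podDeletionTime time.Time, recreationTimeout time.Duration) (ctrl.Result, bool) {
	waitTime := time.Until(podDeletionTime.Add(recreationTimeout))
	if waitTime > 0 {
		// Not yet: ask controller-runtime to call Reconcile again once
		// the remaining time has passed.
		return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, false
	}
	// Timeout elapsed: the caller may recreate the pod now.
	return ctrl.Result{}, true
}
```

Deriving the wait from PodDeletionTime (persisted in status by the status.go hunk above) keeps the backoff stable across reconciliations, even if the controller restarts in between.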