From ba21ee7618edbd2fcd608d4b60ec4c6d4f0a1415 Mon Sep 17 00:00:00 2001
From: Gero Posmyk-Leinemann
Date: Tue, 12 Nov 2024 15:22:24 +0000
Subject: [PATCH] review comments + log cleanup

---
 components/ws-daemon/pkg/cgroup/cgroup.go        |  3 ++-
 components/ws-daemon/pkg/container/containerd.go |  4 ++--
 components/ws-daemon/pkg/content/hooks.go        |  1 +
 .../pkg/controller/workspace_controller.go       |  4 ++--
 .../pkg/controller/workspace_operations.go       |  8 +++-----
 components/ws-daemon/pkg/cpulimit/dispatch.go    |  2 +-
 components/ws-daemon/pkg/daemon/markunmount.go   |  3 ++-
 components/ws-daemon/pkg/diskguard/guard.go      |  1 +
 components/ws-daemon/pkg/dispatch/dispatch.go    | 16 +++++-----------
 components/ws-daemon/pkg/iws/iws.go              |  8 +++-----
 components/ws-daemon/pkg/netlimit/netlimit.go    |  5 +++--
 components/ws-daemon/pkg/nsinsider/nsinsider.go  |  1 +
 components/ws-manager-mk2/controllers/status.go  |  1 -
 .../controllers/workspace_controller.go          |  9 +++++----
 14 files changed, 31 insertions(+), 35 deletions(-)

diff --git a/components/ws-daemon/pkg/cgroup/cgroup.go b/components/ws-daemon/pkg/cgroup/cgroup.go
index bce65bbd53448a..899c1064e3bcfd 100644
--- a/components/ws-daemon/pkg/cgroup/cgroup.go
+++ b/components/ws-daemon/pkg/cgroup/cgroup.go
@@ -6,6 +6,7 @@ package cgroup

 import (
 	"context"
+	"errors"

 	"github.com/gitpod-io/gitpod/common-go/cgroups"
 	"github.com/gitpod-io/gitpod/common-go/log"
@@ -81,7 +82,7 @@ func (host *PluginHost) WorkspaceAdded(ctx context.Context, ws *dispatch.Workspa

 	cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
 	if err != nil {
-		if err == context.Canceled {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return xerrors.Errorf("cannot get cgroup path for container %s: %w", ws.ContainerID, err)
diff --git a/components/ws-daemon/pkg/container/containerd.go b/components/ws-daemon/pkg/container/containerd.go
index 26e82912914e40..c5df53f6ef3888 100644
--- a/components/ws-daemon/pkg/container/containerd.go
+++ b/components/ws-daemon/pkg/container/containerd.go
@@ -431,8 +431,8 @@ func (s *Containerd) WaitForContainerStop(ctx context.Context, workspaceInstance

 func (s *Containerd) DisposeContainer(ctx context.Context, workspaceInstanceID string) {
 	log := log.WithContext(ctx)
-	log.Debug("CONTAINERD: DISPOSING CONTAINER")
-	defer log.Debug("CONTAINERD: DISPOSING CONTAINER DONE")
+	log.Debug("containerd: disposing container")
+	defer log.Debug("containerd: disposing container done")

 	s.cond.L.Lock()
 	defer s.cond.L.Unlock()
diff --git a/components/ws-daemon/pkg/content/hooks.go b/components/ws-daemon/pkg/content/hooks.go
index 22358539958eff..3662097c7d9567 100644
--- a/components/ws-daemon/pkg/content/hooks.go
+++ b/components/ws-daemon/pkg/content/hooks.go
@@ -176,6 +176,7 @@ func hookWipingTeardown() session.WorkspaceLivecycleHook {

 		if !ws.DoWipe {
 			// this is the "default" case for 99% of all workspaces
+			// TODO(gpl): We should probably make this the default for all workspaces - but not with this PR
 			return nil
 		}
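Aside (not part of the patch): the next hunk persists the StateWiped condition via client-go's retry.RetryOnConflict. A minimal sketch of that pattern, assuming a controller-runtime client; the helper name and wiring here are illustrative only:

```go
package sketch

import (
	"context"

	"k8s.io/client-go/util/retry"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// updateStatusWithRetry re-fetches the object and re-applies the mutation
// whenever the API server rejects the update with a conflict (stale
// resourceVersion), using client-go's default backoff.
func updateStatusWithRetry(ctx context.Context, c client.Client, key client.ObjectKey, obj client.Object, mutate func()) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		// Fetch the latest version first; retrying on a stale copy
		// would just hit the same conflict again.
		if err := c.Get(ctx, key, obj); err != nil {
			return err
		}
		mutate()
		return c.Status().Update(ctx, obj)
	})
}
```

This is why the closure in the hunk below starts with wsc.Get before setting the condition.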
diff --git a/components/ws-daemon/pkg/controller/workspace_controller.go b/components/ws-daemon/pkg/controller/workspace_controller.go
index ae6991d68c4daf..59fb50c1dcf986 100644
--- a/components/ws-daemon/pkg/controller/workspace_controller.go
+++ b/components/ws-daemon/pkg/controller/workspace_controller.go
@@ -255,13 +255,13 @@ func (wsc *WorkspaceController) doWipeWorkspace(ctx context.Context, ws *workspa
 		return ctrl.Result{Requeue: true, RequeueAfter: 100 * time.Millisecond}, nil
 	}

-	setStateWipedCondition := func(s bool) {
+	setStateWipedCondition := func(success bool) {
 		err := retry.RetryOnConflict(retryParams, func() error {
 			if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
 				return err
 			}
-			if s {
+			if success {
 				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionTrue))
 			} else {
 				ws.Status.SetCondition(workspacev1.NewWorkspaceConditionStateWiped("", metav1.ConditionFalse))
 			}
diff --git a/components/ws-daemon/pkg/controller/workspace_operations.go b/components/ws-daemon/pkg/controller/workspace_operations.go
index 0010f1d4551b66..f2287519553fc9 100644
--- a/components/ws-daemon/pkg/controller/workspace_operations.go
+++ b/components/ws-daemon/pkg/controller/workspace_operations.go
@@ -315,9 +315,6 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
 	wso.dispatch.DisposeWorkspace(ctx, instanceID)

 	// remove workspace daemon directory in the node
-	log.Debug("DELETING WORKSPACE DAEMON DIR")
-	defer log.Debug("DELETING WORKSPACE DAEMON DIR DONE")
-
 	removedChan := make(chan struct{}, 1)
 	go func() {
 		defer close(removedChan)
@@ -327,9 +324,10 @@ func (wso *DefaultWorkspaceOperations) WipeWorkspace(ctx context.Context, instan
 		}
 	}()

-	timeoutT := time.NewTicker(10 * time.Second)
+	timeout := time.NewTicker(10 * time.Second)
+	defer timeout.Stop()
 	select {
-	case <-timeoutT.C:
+	case <-timeout.C:
 	case <-removedChan:
 		log.Debug("successfully removed workspace daemon directory")
 	}
diff --git a/components/ws-daemon/pkg/cpulimit/dispatch.go b/components/ws-daemon/pkg/cpulimit/dispatch.go
index 44b456ad171166..8020d7b9bb5d78 100644
--- a/components/ws-daemon/pkg/cpulimit/dispatch.go
+++ b/components/ws-daemon/pkg/cpulimit/dispatch.go
@@ -181,7 +181,7 @@ func (d *DispatchListener) WorkspaceAdded(ctx context.Context, ws *dispatch.Work

 	cgroupPath, err := disp.Runtime.ContainerCGroupPath(ctx, ws.ContainerID)
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return xerrors.Errorf("cannot start governer: %w", err)
diff --git a/components/ws-daemon/pkg/daemon/markunmount.go b/components/ws-daemon/pkg/daemon/markunmount.go
index 2e647f81e1cf1b..30fa029a5066c9 100644
--- a/components/ws-daemon/pkg/daemon/markunmount.go
+++ b/components/ws-daemon/pkg/daemon/markunmount.go
@@ -8,6 +8,7 @@ import (
 	"bufio"
 	"bytes"
 	"context"
+	"errors"
 	"io/ioutil"
 	"path/filepath"
 	"strings"
@@ -127,7 +128,7 @@ func (c *MarkUnmountFallback) WorkspaceUpdated(ctx context.Context, ws *dispatch
 	}

 	err := unmountMark(ws.InstanceID)
-	if err != nil && !dispatch.IsCancelled(ctx) {
+	if err != nil && !errors.Is(err, context.Canceled) {
 		log.WithFields(ws.OWI()).WithError(err).Error("cannot unmount mark mount from within ws-daemon")
 		c.activityCounter.WithLabelValues("false").Inc()
 	} else {
diff --git a/components/ws-daemon/pkg/diskguard/guard.go b/components/ws-daemon/pkg/diskguard/guard.go
index fe21543e8df365..92183c15644860 100644
--- a/components/ws-daemon/pkg/diskguard/guard.go
+++ b/components/ws-daemon/pkg/diskguard/guard.go
@@ -71,6 +71,7 @@ type Guard struct {
 // Start starts the disk guard
 func (g *Guard) Start() {
 	t := time.NewTicker(g.Interval)
+	defer t.Stop()
 	for {
 		bvail, err := getAvailableBytes(g.Path)
 		if err != nil {
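Aside (not part of the patch): several hunks above replace `err == context.Canceled` and the `dispatch.IsCancelled(ctx)` helper with `errors.Is(err, context.Canceled)`, and the next hunk deletes the helper itself. A minimal self-contained example of the difference:

```go
package main

import (
	"context"
	"errors"
	"fmt"
)

func main() {
	// Errors returned through call chains are usually wrapped with %w.
	wrapped := fmt.Errorf("cannot get cgroup path: %w", context.Canceled)

	fmt.Println(wrapped == context.Canceled)          // false: equality sees only the outer error
	fmt.Println(errors.Is(wrapped, context.Canceled)) // true: errors.Is walks the wrap chain
}
```

Checking the returned error rather than the context also avoids swallowing unrelated failures that merely happen to occur while the context is being cancelled.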
 	return ctx.Value(contextDispatchWaitGroup).(*sync.WaitGroup)
 }

-func IsCancelled(ctx context.Context) bool {
-	return context.Cause(ctx) != nil
-}
-
 // Start starts the dispatch
 func (d *Dispatch) Start() error {
 	ifac := informers.NewSharedInformerFactoryWithOptions(d.Kubernetes, podInformerResyncInterval, informers.WithNamespace(d.KubernetesNamespace))
@@ -195,8 +191,8 @@ func (d *Dispatch) DisposeWorkspace(ctx context.Context, instanceID string) {
 	d.mu.Lock()
 	defer d.mu.Unlock()

-	log.WithField("instanceID", instanceID).Debug("WS DISPOSE")
-	defer log.WithField("instanceID", instanceID).Debug("WS DISPOSE DONE")
+	log.WithField("instanceID", instanceID).Debug("disposing workspace")
+	defer log.WithField("instanceID", instanceID).Debug("disposing workspace done")

 	// If we have that instanceID present, cancel it's context
 	state, present := d.ctxs[instanceID]
@@ -237,10 +233,9 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
 	}
 	disposedKey := disposedKey(workspaceInstanceID, newPod)
 	if _, alreadyDisposed := d.disposedCtxs[disposedKey]; alreadyDisposed {
-		log.WithField("disposedKey", disposedKey).Debug("DROPPING POD UPDATE FOR DISPOSED POD")
+		log.WithField("disposedKey", disposedKey).Debug("dropping pod update for disposed pod")
 		return
 	}
-	log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE: %s", workspaceInstanceID)

 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -337,7 +332,6 @@ func (d *Dispatch) handlePodUpdate(oldPod, newPod *corev1.Pod) {
 			}
 		}()
 	}
-	log.WithField("instanceID", workspaceInstanceID).Debugf("POD UPDATE DONE: %s", workspaceInstanceID)
 }

 func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
@@ -345,7 +339,8 @@ func (d *Dispatch) handlePodDeleted(pod *corev1.Pod) {
 	if !ok {
 		return
 	}
-	log.WithField("instanceID", instanceID).Debugf("POD DELETED: %s", instanceID)
+	log.WithField("instanceID", instanceID).Debug("pod deleted")
+	defer log.WithField("instanceID", instanceID).Debug("pod deleted done")

 	d.mu.Lock()
 	defer d.mu.Unlock()
@@ -361,5 +356,4 @@

 	delete(d.ctxs, instanceID)

-	log.WithField("instanceID", instanceID).Debugf("POD DELETED DONE: %s", instanceID)
 }
diff --git a/components/ws-daemon/pkg/iws/iws.go b/components/ws-daemon/pkg/iws/iws.go
index f444fbfed9c14c..a2916cb909cff6 100644
--- a/components/ws-daemon/pkg/iws/iws.go
+++ b/components/ws-daemon/pkg/iws/iws.go
@@ -961,9 +961,6 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
 	owi := wbs.Session.OWI()
 	log := log.WithFields(owi)

-	log.Debug("TEARDOWN")
-	defer log.Debug("TEARDOWN DONE")
-
 	var (
 		success = true
 		err     error
@@ -981,8 +978,9 @@ func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.Tear
 // WipingTeardown tears down every state we created using IWS
 func (wbs *InWorkspaceServiceServer) WipingTeardown(ctx context.Context, req *api.WipingTeardownRequest) (*api.WipingTeardownResponse, error) {
 	log := log.WithFields(wbs.Session.OWI())
-	log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN")
-	defer log.WithField("doWipe", req.DoWipe).Debug("WIPING TEARDOWN DONE")
+	log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown")
+	defer log.WithField("doWipe", req.DoWipe).Debug("iws.WipingTeardown done")
+
 	if !req.DoWipe {
 		return &api.WipingTeardownResponse{Success: true}, nil
 	}
diff --git a/components/ws-daemon/pkg/netlimit/netlimit.go b/components/ws-daemon/pkg/netlimit/netlimit.go
index 3fc26ef79065d0..cfe29ca86b79c4 100644
--- a/components/ws-daemon/pkg/netlimit/netlimit.go
+++ b/components/ws-daemon/pkg/netlimit/netlimit.go
@@ -6,6 +6,7 @@ package netlimit

 import (
 	"context"
+	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -130,7 +131,7 @@ func (c *ConnLimiter) limitWorkspace(ctx context.Context, ws *dispatch.Workspace

 	pid, err := disp.Runtime.ContainerPID(ctx, ws.ContainerID)
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		return fmt.Errorf("could not get pid for container %s of workspace %s", ws.ContainerID, ws.WorkspaceID)
@@ -144,7 +145,7 @@ func (c *ConnLimiter) limitWorkspace(ctx context.Context, ws *dispatch.Workspace
 		}
 	}, nsinsider.EnterMountNS(false), nsinsider.EnterNetNS(true))
 	if err != nil {
-		if dispatch.IsCancelled(ctx) {
+		if errors.Is(err, context.Canceled) {
 			return nil
 		}
 		log.WithError(err).WithFields(ws.OWI()).Error("cannot enable connection limiting")
diff --git a/components/ws-daemon/pkg/nsinsider/nsinsider.go b/components/ws-daemon/pkg/nsinsider/nsinsider.go
index 0b7d3473be3bf9..53951490f938e8 100644
--- a/components/ws-daemon/pkg/nsinsider/nsinsider.go
+++ b/components/ws-daemon/pkg/nsinsider/nsinsider.go
@@ -110,6 +110,7 @@ func Nsinsider(instanceID string, targetPid int, mod func(*exec.Cmd), opts ...ns
 	_, _ = io.Copy(os.Stderr, &cmdErr)
 	log.FromBuffer(&cmdOut, log.WithFields(log.OWI("", "", instanceID)))
 	if err != nil {
+		// writing stderr to the error so clients can pattern match on specific errors
 		return fmt.Errorf("run nsinsider (%v) failed: %q \\ %q\n%v",
 			cmd.Args,
 			cmdOut.String(),
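Aside (not part of the patch): the comment added in the nsinsider hunk above notes that stderr is folded into the returned error so callers can match on specific failure messages. The general shape of that technique, with a hypothetical helper:

```go
package main

import (
	"bytes"
	"fmt"
	"os/exec"
)

// runTool is a hypothetical helper illustrating the pattern: capture the
// command's output streams and include them in the returned error.
func runTool(args ...string) error {
	var stdout, stderr bytes.Buffer
	cmd := exec.Command("nsinsider", args...)
	cmd.Stdout = &stdout
	cmd.Stderr = &stderr

	if err := cmd.Run(); err != nil {
		// Callers can now pattern match on the captured output instead
		// of seeing only an opaque exit code.
		return fmt.Errorf("run nsinsider (%v) failed: %q \\ %q\n%v",
			cmd.Args, stdout.String(), stderr.String(), err)
	}
	return nil
}

func main() {
	if err := runTool("--help"); err != nil {
		fmt.Println(err)
	}
}
```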
 		// This is used for the delaying eventual pod restarts
diff --git a/components/ws-manager-mk2/controllers/workspace_controller.go b/components/ws-manager-mk2/controllers/workspace_controller.go
index 68c00f8f5ce67d..791912f00159a5 100644
--- a/components/ws-manager-mk2/controllers/workspace_controller.go
+++ b/components/ws-manager-mk2/controllers/workspace_controller.go
@@ -224,22 +224,23 @@ func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *worksp
 			workspace.Status.SetCondition(workspacev1.NewWorkspaceConditionPodRejected(fmt.Sprintf("Pod reached maximum recreations %d, failing", workspace.Status.PodRecreated), metav1.ConditionFalse))
 			return ctrl.Result{Requeue: true}, nil // requeue so we end up in the "Stopped" case below
 		}
-		log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase).Info("trigger pod recreation")
+		log = log.WithValues("PodStarts", workspace.Status.PodStarts, "PodRecreated", workspace.Status.PodRecreated, "Phase", workspace.Status.Phase)

 		// Make sure to wait for "recreationTimeout" before creating the pod again
 		if workspace.Status.PodDeletionTime == nil {
-			log.Info("want to wait for pod recreation timeout, but podDeletionTime not set (yet)")
+			log.Info("pod recreation: waiting for pod deletion time to be populated...")
 			return ctrl.Result{Requeue: true, RequeueAfter: 5 * time.Second}, nil
 		}

 		recreationTimeout := r.podRecreationTimeout()
 		podDeletionTime := workspace.Status.PodDeletionTime.Time
 		waitTime := time.Until(podDeletionTime.Add(recreationTimeout))
+		log = log.WithValues("waitTime", waitTime.String(), "recreationTimeout", recreationTimeout.String(), "podDeletionTime", podDeletionTime.String())
 		if waitTime > 0 {
-			log.WithValues("waitTime", waitTime).Info("waiting for pod recreation timeout")
+			log.Info("pod recreation: waiting for timeout...")
 			return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, nil
 		}
-		log.WithValues("waitedTime", waitTime.Abs().String()).Info("waited for pod recreation timeout")
+		log.Info("trigger pod recreation")

 		// Must persist the modification pod starts, and ensure we retry on conflict.
 		// If we fail to persist this value, it's possible that the Pod gets recreated endlessly
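Aside (not part of the patch): the final hunk waits out the recreation timeout by returning a ctrl.Result with RequeueAfter, which asks controller-runtime to run Reconcile again later instead of blocking in the reconciler. A minimal sketch of that pattern; the function and its arguments are illustrative:

```go
package sketch

import (
	"time"

	ctrl "sigs.k8s.io/controller-runtime"
)

// waitOrProceed reports whether the recreation timeout has elapsed since
// the pod was deleted, and if not, how to requeue the reconciliation.
func waitOrProceed(podDeletionTime time.Time, recreationTimeout time.Duration) (ctrl.Result, bool) {
	waitTime := time.Until(podDeletionTime.Add(recreationTimeout))
	if waitTime > 0 {
		// Not yet: ask controller-runtime to call Reconcile again once
		// the remaining time has passed.
		return ctrl.Result{Requeue: true, RequeueAfter: waitTime}, false
	}
	// Timeout elapsed: the caller may recreate the pod now.
	return ctrl.Result{}, true
}
```

Deriving the wait from PodDeletionTime (persisted in status by the status.go hunk above) keeps the backoff stable across reconciliations, even if the controller restarts in between.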