Skip to content

Commit

Permalink
[ws-daemon] start backup even pod still report the container is runni…
Browse files Browse the repository at this point in the history
…ng after 5 minutes (#20382)
  • Loading branch information
iQQBot authored Nov 22, 2024
1 parent 08d82bc commit b77f687
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 11 deletions.
2 changes: 1 addition & 1 deletion components/workspacekit/cmd/rings.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ var ring0Cmd = &cobra.Command{

_ = cmd.Process.Signal(unix.SIGTERM)
time.Sleep(ring1ShutdownTimeout)
if cmd.Process == nil {
if cmd.Process == nil || cmd.ProcessState.Exited() {
return
}

Expand Down
5 changes: 5 additions & 0 deletions components/ws-daemon/pkg/container/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"golang.org/x/xerrors"

"github.com/containerd/containerd/api/types/task"
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

Expand Down Expand Up @@ -54,6 +55,10 @@ type Runtime interface {

// DisposeContainer removes a stopped container, and everything we know about it
DisposeContainer(ctx context.Context, workspaceInstanceID string)

GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error)

ForceKillContainerTask(ctx context.Context, id ID) error
}

var (
Expand Down
23 changes: 23 additions & 0 deletions components/ws-daemon/pkg/container/containerd.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/api/services/tasks/v1"
"github.com/containerd/containerd/api/types"
"github.com/containerd/containerd/api/types/task"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/images"
Expand Down Expand Up @@ -576,6 +577,28 @@ func (s *Containerd) IsContainerdReady(ctx context.Context) (bool, error) {
return true, nil
}

func (s *Containerd) GetContainerTaskInfo(ctx context.Context, id ID) (*task.Process, error) {
task, err := s.Client.TaskService().Get(ctx, &tasks.GetRequest{
ContainerID: string(id),
})
if err != nil {
return nil, err
}
if task.Process == nil {
return nil, fmt.Errorf("task has no process")
}
return task.Process, nil
}

func (s *Containerd) ForceKillContainerTask(ctx context.Context, id ID) error {
_, err := s.Client.TaskService().Kill(ctx, &tasks.KillRequest{
ContainerID: string(id),
Signal: 9,
All: true,
})
return err
}

var kubepodsQoSRegexp = regexp.MustCompile(`([^/]+)-([^/]+)-pod`)
var kubepodsRegexp = regexp.MustCompile(`([^/]+)-pod`)

Expand Down
68 changes: 65 additions & 3 deletions components/ws-daemon/pkg/controller/workspace_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
"github.com/opentracing/opentracing-go"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"

"google.golang.org/protobuf/proto"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -349,9 +350,43 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp

if ws.IsConditionTrue(workspacev1.WorkspaceConditionContainerRunning) {
// Container is still running, we need to wait for it to stop.
// We should get an event when the condition changes, but requeue
// anyways to make sure we act on it in time.
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
// We will wait for this situation for up to 5 minutes.
// If the container is still in a running state after that,
// there may be an issue with state synchronization.
// We should start backup anyway to avoid data loss.
if !(ws.Status.PodStoppingTime != nil && time.Since(ws.Status.PodStoppingTime.Time) > 5*time.Minute) {
// We should get an event when the condition changes, but requeue
// anyways to make sure we act on it in time.
return ctrl.Result{RequeueAfter: 500 * time.Millisecond}, nil
}

if !ws.IsConditionTrue(workspacev1.WorkspaceConditionForceKilledTask) {
err = wsc.forceKillContainerTask(ctx, ws)
if err != nil {
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to force kill task: %v", err)
}
err = retry.RetryOnConflict(retryParams, func() error {
if err := wsc.Get(ctx, req.NamespacedName, ws); err != nil {
return err
}
ws.Status.SetCondition(workspacev1.NewWorkspaceConditionForceKilledTask())
return wsc.Client.Status().Update(ctx, ws)
})
if err != nil {
return ctrl.Result{}, fmt.Errorf("failed to set force killed task condition: %w", err)
}
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
}

if time.Since(wsk8s.GetCondition(ws.Status.Conditions, string(workspacev1.WorkspaceConditionForceKilledTask)).LastTransitionTime.Time) < 2*time.Second {
return ctrl.Result{Requeue: true, RequeueAfter: 2 * time.Second}, nil
}

glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Warn("workspace container is still running after 5 minutes of deletion, starting backup anyway")
err = wsc.dumpWorkspaceContainerInfo(ctx, ws)
if err != nil {
glog.WithFields(ws.OWI()).WithField("workspace", req.NamespacedName).Errorf("failed to dump container info: %v", err)
}
}

if wsc.latestWorkspace(ctx, ws) != nil {
Expand Down Expand Up @@ -442,6 +477,33 @@ func (wsc *WorkspaceController) doWorkspaceContentBackup(ctx context.Context, sp
return ctrl.Result{}, nil
}

func (wsc *WorkspaceController) dumpWorkspaceContainerInfo(ctx context.Context, ws *workspacev1.Workspace) error {
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
if err != nil {
return fmt.Errorf("failed to wait for container: %w", err)
}
task, err := wsc.runtime.GetContainerTaskInfo(ctx, id)
if err != nil {
return fmt.Errorf("failed to get container task info: %w", err)
}
glog.WithFields(ws.OWI()).WithFields(logrus.Fields{
"containerID": id,
"exitStatus": task.ExitStatus,
"pid": task.Pid,
"exitedAt": task.ExitedAt.AsTime(),
"status": task.Status.String(),
}).Info("container task info")
return nil
}

func (wsc *WorkspaceController) forceKillContainerTask(ctx context.Context, ws *workspacev1.Workspace) error {
id, err := wsc.runtime.WaitForContainer(ctx, ws.Name)
if err != nil {
return fmt.Errorf("failed to wait for container: %w", err)
}
return wsc.runtime.ForceKillContainerTask(ctx, id)
}

func (wsc *WorkspaceController) prepareInitializer(ctx context.Context, ws *workspacev1.Workspace) (*csapi.WorkspaceInitializer, error) {
var init csapi.WorkspaceInitializer
err := proto.Unmarshal(ws.Spec.Initializer, &init)
Expand Down
25 changes: 21 additions & 4 deletions components/ws-manager-api/go/crd/v1/workspace_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,17 @@ type WorkspaceImageInfo struct {

// WorkspaceStatus defines the observed state of Workspace
type WorkspaceStatus struct {
PodStarts int `json:"podStarts"`
PodRecreated int `json:"podRecreated"`
PodStarts int `json:"podStarts"`

// +kubebuilder:validation:Optional
PodRecreated int `json:"podRecreated"`
// +kubebuilder:validation:Optional
PodDeletionTime *metav1.Time `json:"podDeletionTime,omitempty"`
URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`
// +kubebuilder:validation:Optional
PodStoppingTime *metav1.Time `json:"podStoppingTime,omitempty"`

URL string `json:"url,omitempty" scrub:"redact"`
OwnerToken string `json:"ownerToken,omitempty" scrub:"redact"`

// +kubebuilder:default=Unknown
Phase WorkspacePhase `json:"phase,omitempty"`
Expand Down Expand Up @@ -285,6 +291,9 @@ const (

// WorkspaceConditionStateWiped is true once all state has successfully been wiped by ws-daemon. This is only set if PodRejected=true, and the rejected workspace has been deleted.
WorkspaceConditionStateWiped WorkspaceCondition = "StateWiped"

// WorkspaceConditionForceKilledTask is true if we send a SIGKILL to the task
WorkspaceConditionForceKilledTask WorkspaceCondition = "ForceKilledTask"
)

func NewWorkspaceConditionDeployed() metav1.Condition {
Expand Down Expand Up @@ -439,6 +448,14 @@ func NewWorkspaceConditionContainerRunning(status metav1.ConditionStatus) metav1
}
}

func NewWorkspaceConditionForceKilledTask() metav1.Condition {
return metav1.Condition{
Type: string(WorkspaceConditionForceKilledTask),
LastTransitionTime: metav1.Now(),
Status: metav1.ConditionTrue,
}
}

// +kubebuilder:validation:Enum:=Unknown;Pending;Imagebuild;Creating;Initializing;Running;Stopping;Stopped
type WorkspacePhase string

Expand Down
8 changes: 8 additions & 0 deletions components/ws-manager-api/go/crd/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -548,11 +548,14 @@ spec:
- Stopping
- Stopped
type: string
podStarts:
type: integer
podDeletionTime:
format: date-time
type: string
podRecreated:
type: integer
podDeletionTime:
podStarts:
type: integer
podStoppingTime:
format: date-time
type: string
runtime:
Expand Down
4 changes: 4 additions & 0 deletions components/ws-manager-mk2/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@ func (r *WorkspaceReconciler) updateWorkspaceStatus(ctx context.Context, workspa
defer func() {
if oldPhase != workspace.Status.Phase {
log.Info("workspace phase updated", "oldPhase", oldPhase, "phase", workspace.Status.Phase)
if workspace.Status.Phase == workspacev1.WorkspacePhaseStopping {
t := metav1.Now()
workspace.Status.PodStoppingTime = &t
}
}
}()

Expand Down

0 comments on commit b77f687

Please sign in to comment.