From d74a210231f94922497da67655fc13a1668ca84e Mon Sep 17 00:00:00 2001 From: Eli Wrenn Date: Thu, 21 Jan 2021 18:56:10 -0800 Subject: [PATCH] Add exponential backoff when apps are failing that has a ceiling of syncPeriod --- pkg/apis/kappctrl/v1alpha1/types.go | 3 + .../v1alpha1/zz_generated.deepcopy.go | 81 ++++++++++- pkg/app/app_reconcile.go | 72 +--------- pkg/app/reconcile_timer.go | 86 +++++++++++ pkg/app/reconcile_timer_test.go | 135 ++++++++++++++++++ test/e2e/git_test.go | 10 +- test/e2e/helm_test.go | 5 +- test/e2e/http_test.go | 10 +- 8 files changed, 325 insertions(+), 77 deletions(-) create mode 100644 pkg/app/reconcile_timer.go create mode 100644 pkg/app/reconcile_timer_test.go diff --git a/pkg/apis/kappctrl/v1alpha1/types.go b/pkg/apis/kappctrl/v1alpha1/types.go index 488e44aa6..dac388657 100644 --- a/pkg/apis/kappctrl/v1alpha1/types.go +++ b/pkg/apis/kappctrl/v1alpha1/types.go @@ -68,6 +68,9 @@ type AppStatus struct { Deploy *AppStatusDeploy `json:"deploy,omitempty"` Inspect *AppStatusInspect `json:"inspect,omitempty"` + ConsecutiveReconcileSuccesses int `json:"consecutiveReconcileSuccesses,omitempty"` + ConsecutiveReconcileFailures int `json:"consecutiveReconcileFailures,omitempty"` + ObservedGeneration int64 `json:"observedGeneration"` Conditions []AppCondition `json:"conditions"` diff --git a/pkg/apis/kappctrl/v1alpha1/zz_generated.deepcopy.go b/pkg/apis/kappctrl/v1alpha1/zz_generated.deepcopy.go index 16eb06399..326474af7 100644 --- a/pkg/apis/kappctrl/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/apis/kappctrl/v1alpha1/zz_generated.deepcopy.go @@ -5,6 +5,7 @@ package v1alpha1 import ( + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -486,6 +487,11 @@ func (in *AppSpec) DeepCopyInto(out *AppSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.SyncPeriod != nil { + in, out := &in.SyncPeriod, &out.SyncPeriod + *out = new(v1.Duration) + **out = **in + } return } @@ -621,7 +627,7 @@ func (in *AppTemplate) DeepCopyInto(out *AppTemplate) { if in.Kbld != nil { in, out := &in.Kbld, &out.Kbld *out = new(AppTemplateKbld) - **out = **in + (*in).DeepCopyInto(*out) } if in.HelmTemplate != nil { in, out := &in.HelmTemplate, &out.HelmTemplate @@ -638,6 +644,11 @@ func (in *AppTemplate) DeepCopyInto(out *AppTemplate) { *out = new(AppTemplateJsonnet) **out = **in } + if in.Sops != nil { + in, out := &in.Sops, &out.Sops + *out = new(AppTemplateSops) + (*in).DeepCopyInto(*out) + } return } @@ -736,6 +747,11 @@ func (in *AppTemplateJsonnet) DeepCopy() *AppTemplateJsonnet { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AppTemplateKbld) DeepCopyInto(out *AppTemplateKbld) { *out = *in + if in.Paths != nil { + in, out := &in.Paths, &out.Paths + *out = make([]string, len(*in)) + copy(*out, *in) + } return } @@ -765,6 +781,69 @@ func (in *AppTemplateKustomize) DeepCopy() *AppTemplateKustomize { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AppTemplateSops) DeepCopyInto(out *AppTemplateSops) { + *out = *in + if in.PGP != nil { + in, out := &in.PGP, &out.PGP + *out = new(AppTemplateSopsPGP) + (*in).DeepCopyInto(*out) + } + if in.Paths != nil { + in, out := &in.Paths, &out.Paths + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppTemplateSops. +func (in *AppTemplateSops) DeepCopy() *AppTemplateSops { + if in == nil { + return nil + } + out := new(AppTemplateSops) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AppTemplateSopsPGP) DeepCopyInto(out *AppTemplateSopsPGP) { + *out = *in + if in.PrivateKeysSecretRef != nil { + in, out := &in.PrivateKeysSecretRef, &out.PrivateKeysSecretRef + *out = new(AppTemplateSopsPGPPrivateKeysSecretRef) + **out = **in + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppTemplateSopsPGP. +func (in *AppTemplateSopsPGP) DeepCopy() *AppTemplateSopsPGP { + if in == nil { + return nil + } + out := new(AppTemplateSopsPGP) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AppTemplateSopsPGPPrivateKeysSecretRef) DeepCopyInto(out *AppTemplateSopsPGPPrivateKeysSecretRef) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AppTemplateSopsPGPPrivateKeysSecretRef. +func (in *AppTemplateSopsPGPPrivateKeysSecretRef) DeepCopy() *AppTemplateSopsPGPPrivateKeysSecretRef { + if in == nil { + return nil + } + out := new(AppTemplateSopsPGPPrivateKeysSecretRef) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AppTemplateYtt) DeepCopyInto(out *AppTemplateYtt) { *out = *in diff --git a/pkg/app/app_reconcile.go b/pkg/app/app_reconcile.go index 9c1799e34..5f8f17937 100644 --- a/pkg/app/app_reconcile.go +++ b/pkg/app/app_reconcile.go @@ -12,7 +12,6 @@ import ( "github.com/vmware-tanzu/carvel-kapp-controller/pkg/memdir" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) @@ -37,7 +36,7 @@ func (a *App) Reconcile() (reconcile.Result, error) { err = a.reconcileDelete() - case a.shouldReconcile(time.Now()): + case NewReconcileTimer(a.app).IsReadyAt(time.Now()): a.log.Info("Started deploy") defer func() { a.log.Info("Completed deploy") }() @@ -47,7 +46,7 @@ func (a *App) Reconcile() (reconcile.Result, error) { a.log.Info("Reconcile noop") } - return a.requeueIfNecessary(), err + return reconcile.Result{RequeueAfter: NewReconcileTimer(a.app).DurationUntilReady(err)}, err } func (a *App) reconcileDelete() error { @@ -221,6 +220,8 @@ func (a *App) setReconcileCompleted(result exec.CmdRunResult) { Status: corev1.ConditionTrue, Message: result.ErrorStr(), }) + a.app.Status.ConsecutiveReconcileFailures++ + a.app.Status.ConsecutiveReconcileSuccesses = 0 a.app.Status.FriendlyDescription = fmt.Sprintf("Reconcile failed: %s", result.ErrorStr()) } else { a.app.Status.Conditions = append(a.app.Status.Conditions, v1alpha1.AppCondition{ @@ -228,6 +229,8 @@ func (a *App) setReconcileCompleted(result exec.CmdRunResult) { Status: corev1.ConditionTrue, Message: "", }) + a.app.Status.ConsecutiveReconcileSuccesses++ + a.app.Status.ConsecutiveReconcileFailures = 0 a.app.Status.FriendlyDescription = "Reconcile succeeded" } } @@ -258,69 +261,6 @@ func (a *App) setDeleteCompleted(result exec.CmdRunResult) { } } -func (a *App) syncPeriod() time.Duration { - const DefaultSyncPeriod = 30 * time.Second - if sp := a.app.Spec.SyncPeriod; sp != nil && sp.Duration > DefaultSyncPeriod { - return sp.Duration - } - return DefaultSyncPeriod -} - -func (a *App) requeueIfNecessary() reconcile.Result { - var ( - shortDelay = 4 * time.Second - // Must always be >= tooLongAfterSuccess so that we dont requeue - // without work to do - // replace last 5 seconds with int from range [5,10] - longerDelay = a.syncPeriod() - 5 + wait.Jitter(5*time.Second, 1.0) - ) - - if a.shouldReconcile(time.Now().Add(shortDelay)) { - return reconcile.Result{RequeueAfter: shortDelay} - } - return reconcile.Result{RequeueAfter: longerDelay} -} - -func (a *App) shouldReconcile(timeAt time.Time) bool { - const ( - tooLongAfterFailure = 3 * time.Second - ) - tooLongAfterSuccess := a.syncPeriod() - - // Did resource spec change? - if a.app.Status.ObservedGeneration != a.app.Generation { - return true - } - - // If canceled/paused, then no reconcilation until unpaused - if a.app.Spec.Canceled || a.app.Spec.Paused { - return false - } - - // Did we deploy at least once? - lastDeploy := a.app.Status.Deploy - if lastDeploy == nil { - return true - } - - // Did previous deploy fail? - for _, cond := range a.app.Status.Conditions { - if cond.Type == v1alpha1.ReconcileFailed { - // Did we try too long ago? - if timeAt.UTC().Sub(lastDeploy.UpdatedAt.Time) > tooLongAfterFailure { - return true - } - } - } - - // Did we deploy too long ago? - if timeAt.UTC().Sub(lastDeploy.UpdatedAt.Time) > tooLongAfterSuccess { - return true - } - - return false -} - func (a *App) removeAllConditions() { a.app.Status.Conditions = nil } diff --git a/pkg/app/reconcile_timer.go b/pkg/app/reconcile_timer.go new file mode 100644 index 000000000..4699ba3c9 --- /dev/null +++ b/pkg/app/reconcile_timer.go @@ -0,0 +1,86 @@ +package app + +import ( + "math" + "time" + + "github.com/vmware-tanzu/carvel-kapp-controller/pkg/apis/kappctrl/v1alpha1" + "k8s.io/apimachinery/pkg/util/wait" +) + +type ReconcileTimer struct { + app v1alpha1.App +} + +func NewReconcileTimer(app v1alpha1.App) ReconcileTimer { + return ReconcileTimer{*app.DeepCopy()} +} + +func (rt ReconcileTimer) DurationUntilReady(err error) time.Duration { + if err != nil || rt.hasReconcileStatus(v1alpha1.ReconcileFailed) { + return rt.failureSyncPeriod() + } + + return rt.applyJitter(rt.syncPeriod()) +} + +func (rt ReconcileTimer) IsReadyAt(timeAt time.Time) bool { + // Did resource spec change? + if rt.app.Status.ObservedGeneration != rt.app.Generation { + return true + } + + // If canceled/paused, then no reconcilation until unpaused + if rt.app.Spec.Canceled || rt.app.Spec.Paused { + return false + } + + // Did we deploy at least once? + lastFetch := rt.app.Status.Fetch + if lastFetch == nil { + return true + } + + if rt.hasReconcileStatus(v1alpha1.ReconcileFailed) { + if timeAt.UTC().Sub(lastFetch.UpdatedAt.Time) >= rt.failureSyncPeriod() { + return true + } + } + + // Did we deploy too long ago? + if timeAt.UTC().Sub(lastFetch.UpdatedAt.Time) >= rt.syncPeriod() { + return true + } + + return false +} + +func (rt ReconcileTimer) syncPeriod() time.Duration { + const defaultSyncPeriod = 30 * time.Second + if sp := rt.app.Spec.SyncPeriod; sp != nil && sp.Duration > defaultSyncPeriod { + return sp.Duration + } + return defaultSyncPeriod +} + +func (rt ReconcileTimer) failureSyncPeriod() time.Duration { + d := time.Duration(math.Exp2(float64(rt.app.Status.ConsecutiveReconcileFailures))) * time.Second + if d < rt.syncPeriod() { + return d + } + return rt.syncPeriod() +} + +func (rt ReconcileTimer) hasReconcileStatus(c v1alpha1.AppConditionType) bool { + for _, cond := range rt.app.Status.Conditions { + if cond.Type == c { + return true + } + } + return false +} + +func (rt ReconcileTimer) applyJitter(t time.Duration) time.Duration { + const appJitter time.Duration = 5 * time.Second + return t - appJitter + wait.Jitter(appJitter, 1.0) +} diff --git a/pkg/app/reconcile_timer_test.go b/pkg/app/reconcile_timer_test.go new file mode 100644 index 000000000..b0b73f125 --- /dev/null +++ b/pkg/app/reconcile_timer_test.go @@ -0,0 +1,135 @@ +package app + +import ( + "testing" + "time" + + "github.com/vmware-tanzu/carvel-kapp-controller/pkg/apis/kappctrl/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestSucceededDurationUntilReady(t *testing.T) { + syncPeriod := 1 * time.Minute + app := v1alpha1.App{ + Spec: v1alpha1.AppSpec{ + SyncPeriod: &metav1.Duration{Duration: syncPeriod}, + }, + Status: v1alpha1.AppStatus{ + Conditions: []v1alpha1.AppCondition{{Type: v1alpha1.ReconcileSucceeded}}, + }, + } + + for i := 0; i < 100; i++ { + durationUntilReady := NewReconcileTimer(app).DurationUntilReady(nil) + if durationUntilReady < syncPeriod || durationUntilReady > (syncPeriod+10*time.Second) { + t.Fatalf("Expected duration until next reconcile to be in [syncPeriod, syncPeriod + 10]") + } + } +} + +func TestFailedDurationUntilReady(t *testing.T) { + syncPeriod := 30 * time.Second + app := v1alpha1.App{ + Spec: v1alpha1.AppSpec{ + SyncPeriod: &metav1.Duration{Duration: syncPeriod}, + }, + Status: v1alpha1.AppStatus{ + Conditions: []v1alpha1.AppCondition{{Type: v1alpha1.ReconcileFailed}}, + }, + } + + type measurement struct { + NumberOfFailedReconciles int + ExpectedDuration time.Duration + } + + measurements := []measurement{ + {NumberOfFailedReconciles: 1, ExpectedDuration: 2 * time.Second}, + {NumberOfFailedReconciles: 2, ExpectedDuration: 4 * time.Second}, + {NumberOfFailedReconciles: 3, ExpectedDuration: 8 * time.Second}, + {NumberOfFailedReconciles: 4, ExpectedDuration: 16 * time.Second}, + {NumberOfFailedReconciles: 5, ExpectedDuration: 30 * time.Second}, + {NumberOfFailedReconciles: 6, ExpectedDuration: 30 * time.Second}, + } + + for _, m := range measurements { + app.Status.ConsecutiveReconcileFailures = m.NumberOfFailedReconciles + + durationUntilReady := NewReconcileTimer(app).DurationUntilReady(nil) + if durationUntilReady != m.ExpectedDuration { + t.Fatalf( + "Expected app with %d failure(s) to have duration %d but got %d", + m.NumberOfFailedReconciles, + m.ExpectedDuration, + durationUntilReady, + ) + } + } +} + +func TestSucceededIsReadyAt(t *testing.T) { + syncPeriod := 30 * time.Second + timeNow := time.Now() + timeOfReady := timeNow.Add(syncPeriod) + + app := v1alpha1.App{ + Spec: v1alpha1.AppSpec{ + SyncPeriod: &metav1.Duration{Duration: syncPeriod}, + }, + Status: v1alpha1.AppStatus{ + Fetch: &v1alpha1.AppStatusFetch{ + UpdatedAt: metav1.Time{Time: timeNow}, + }, + Conditions: []v1alpha1.AppCondition{{Type: v1alpha1.ReconcileSucceeded}}, + }, + } + + isReady := NewReconcileTimer(app).IsReadyAt(timeOfReady) + if !isReady { + t.Fatalf("Expected app to be ready after syncPeriod of 30s") + } + + isReady = NewReconcileTimer(app).IsReadyAt(timeOfReady.Add(1 * time.Second)) + if !isReady { + t.Fatalf("Expected app to be ready after exceeding syncPeriod of 30s") + } + + isReady = NewReconcileTimer(app).IsReadyAt(timeOfReady.Add(-1 * time.Second)) + if isReady { + t.Fatalf("Expected app to not be ready under syncPeriod of 30s") + } +} + +func TestFailedIsReadyAt(t *testing.T) { + syncPeriod := 2 * time.Second + timeNow := time.Now() + timeOfReady := timeNow.Add(syncPeriod) + + app := v1alpha1.App{ + Spec: v1alpha1.AppSpec{ + SyncPeriod: &metav1.Duration{Duration: syncPeriod}, + }, + Status: v1alpha1.AppStatus{ + Fetch: &v1alpha1.AppStatusFetch{ + UpdatedAt: metav1.Time{Time: timeNow}, + }, + Conditions: []v1alpha1.AppCondition{{Type: v1alpha1.ReconcileFailed}}, + ConsecutiveReconcileFailures: 1, + }, + } + + isReady := NewReconcileTimer(app).IsReadyAt(timeOfReady) + if !isReady { + t.Fatalf("Expected app to be ready after syncPeriod of 2s") + } + + isReady = NewReconcileTimer(app).IsReadyAt(timeOfReady.Add(1 * time.Second)) + if !isReady { + t.Fatalf("Expected app to be ready after exceeding syncPeriod of 2s") + } + + isReady = NewReconcileTimer(app).IsReadyAt(timeOfReady.Add(-1 * time.Second)) + if isReady { + t.Fatalf("Expected app to not be ready under syncPeriod of 2s") + } +} diff --git a/test/e2e/git_test.go b/test/e2e/git_test.go index 5f2e96ced..1ab31c745 100644 --- a/test/e2e/git_test.go +++ b/test/e2e/git_test.go @@ -82,8 +82,9 @@ spec: Template: &v1alpha1.AppStatusTemplate{ ExitCode: 0, }, - ObservedGeneration: 1, - FriendlyDescription: "Reconcile succeeded", + ConsecutiveReconcileSuccesses: 1, + ObservedGeneration: 1, + FriendlyDescription: "Reconcile succeeded", } { @@ -195,8 +196,9 @@ spec: Template: &v1alpha1.AppStatusTemplate{ ExitCode: 0, }, - ObservedGeneration: 1, - FriendlyDescription: "Reconcile succeeded", + ConsecutiveReconcileSuccesses: 1, + ObservedGeneration: 1, + FriendlyDescription: "Reconcile succeeded", } { diff --git a/test/e2e/helm_test.go b/test/e2e/helm_test.go index ee8200dc7..b81193d7d 100644 --- a/test/e2e/helm_test.go +++ b/test/e2e/helm_test.go @@ -95,8 +95,9 @@ stringData: Template: &v1alpha1.AppStatusTemplate{ ExitCode: 0, }, - ObservedGeneration: 1, - FriendlyDescription: "Reconcile succeeded", + ConsecutiveReconcileSuccesses: 1, + ObservedGeneration: 1, + FriendlyDescription: "Reconcile succeeded", } { diff --git a/test/e2e/http_test.go b/test/e2e/http_test.go index e47ba3d22..aa1e9b8fa 100644 --- a/test/e2e/http_test.go +++ b/test/e2e/http_test.go @@ -80,8 +80,9 @@ spec: Template: &v1alpha1.AppStatusTemplate{ ExitCode: 0, }, - ObservedGeneration: 1, - FriendlyDescription: "Reconcile succeeded", + ConsecutiveReconcileSuccesses: 1, + ObservedGeneration: 1, + FriendlyDescription: "Reconcile succeeded", } { @@ -193,8 +194,9 @@ spec: Template: &v1alpha1.AppStatusTemplate{ ExitCode: 0, }, - ObservedGeneration: 1, - FriendlyDescription: "Reconcile succeeded", + ConsecutiveReconcileSuccesses: 1, + ObservedGeneration: 1, + FriendlyDescription: "Reconcile succeeded", } {