From c21d3faae094e665e9eaf6fabf69084f21d65edf Mon Sep 17 00:00:00 2001 From: gabemontero Date: Thu, 18 Jan 2024 16:45:31 -0500 Subject: [PATCH] do not allow negative requeue times Use of the value of 0 for the taskrun/pipeline timeout, which per https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#configuring-a-failure-timeout for example means timeout is disabled, results in the waitTime passed to the Requeue event being negative. This had the observed behavior of Requeue'ing immediately, and intense cycles of many reconciliations per second were observed if the TaskRun's/PipelineRun's state did not in fact change. This artificially constrained the performance of the pipeline controller. This change makes sure the wait time passed to the Requeue is not negative. --- pkg/reconciler/pipelinerun/pipelinerun.go | 9 +- .../pipelinerun/pipelinerun_test.go | 90 +++++++++++++++++++ pkg/reconciler/taskrun/taskrun.go | 29 +++++- pkg/reconciler/taskrun/taskrun_test.go | 86 ++++++++++++++++++ 4 files changed, 212 insertions(+), 2 deletions(-) diff --git a/pkg/reconciler/pipelinerun/pipelinerun.go b/pkg/reconciler/pipelinerun/pipelinerun.go index b9a01f558e5..76fc46d240e 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun.go +++ b/pkg/reconciler/pipelinerun/pipelinerun.go @@ -273,7 +273,14 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pr *v1.PipelineRun) pkgr // Compute the time since the task started. elapsed := c.Clock.Since(pr.Status.StartTime.Time) // Snooze this resource until the appropriate timeout has elapsed. 
- waitTime := pr.PipelineTimeout(ctx) - elapsed + // but if the timeout has been disabled by setting timeout to 0, we + // do not want to subtract from 0, because a negative wait time will + // result in the requeue happening essentially immediately + timeout := pr.PipelineTimeout(ctx) + if timeout == config.NoTimeoutDuration { + timeout = taskrun.DeriveTimeoutForRequeueWaitTime(elapsed) + } + waitTime := timeout - elapsed if pr.Status.FinallyStartTime == nil && pr.TasksTimeout() != nil { waitTime = pr.TasksTimeout().Duration - elapsed } else if pr.Status.FinallyStartTime != nil && pr.FinallyTimeout() != nil { diff --git a/pkg/reconciler/pipelinerun/pipelinerun_test.go b/pkg/reconciler/pipelinerun/pipelinerun_test.go index 1b100cd1b71..6af32515743 100644 --- a/pkg/reconciler/pipelinerun/pipelinerun_test.go +++ b/pkg/reconciler/pipelinerun/pipelinerun_test.go @@ -2486,6 +2486,96 @@ spec: } } +func TestReconcileWithTimeoutDisabled(t *testing.T) { + testCases := []struct { + name string + timeout time.Duration + }{ + { + name: "pipeline timeout is 24h", + timeout: 24 * time.Hour, + }, + { + name: "pipeline timeout is way longer than 24h", + timeout: 360 * time.Hour, + }, + } + + for _, tc := range testCases { + startTime := time.Date(2022, time.January, 1, 0, 0, 0, 0, time.UTC).Add(-3 * tc.timeout) + t.Run(tc.name, func(t *testing.T) { + ps := []*v1.Pipeline{parse.MustParseV1Pipeline(t, ` +metadata: + name: test-pipeline + namespace: foo +spec: + tasks: + - name: hello-world-1 + taskRef: + name: hello-world + - name: hello-world-2 + taskRef: + name: hello-world +`)} + prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, ` +metadata: + name: test-pipeline-run-with-timeout-disabled + namespace: foo +spec: + pipelineRef: + name: test-pipeline + taskRunTemplate: + serviceAccountName: test-sa + timeouts: + pipeline: 0h0m0s +status: + startTime: "2021-12-30T00:00:00Z" +`)} + ts := []*v1.Task{simpleHelloWorldTask} + + trs := 
[]*v1.TaskRun{mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("test-pipeline-run-with-timeout-hello-world-1", "foo", "test-pipeline-run-with-timeout-disabled", + "test-pipeline", "hello-world-1", false), ` +spec: + serviceAccountName: test-sa + taskRef: + name: hello-world + kind: Task +`)} + start := metav1.NewTime(startTime) + prs[0].Status.StartTime = &start + + d := test.Data{ + PipelineRuns: prs, + Pipelines: ps, + Tasks: ts, + TaskRuns: trs, + } + prt := newPipelineRunTest(t, d) + defer prt.Cancel() + + c := prt.TestAssets.Controller + clients := prt.TestAssets.Clients + reconcileError := c.Reconciler.Reconcile(prt.TestAssets.Ctx, "foo/test-pipeline-run-with-timeout-disabled") + if reconcileError == nil { + t.Errorf("expected error, but got nil") + } + if isRequeueError, requeueDuration := controller.IsRequeueKey(reconcileError); !isRequeueError { + t.Errorf("Expected requeue error, but got: %s", reconcileError.Error()) + } else if requeueDuration < 0 { + t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String()) + } + prt.Test.Logf("Getting reconciled run") + reconciledRun, err := clients.Pipeline.TektonV1().PipelineRuns("foo").Get(prt.TestAssets.Ctx, "test-pipeline-run-with-timeout-disabled", metav1.GetOptions{}) + if err != nil { + prt.Test.Errorf("Somehow had error getting reconciled run out of fake client: %s", err) + } + if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason == "PipelineRunTimeout" { + t.Errorf("Expected PipelineRun to not be timed out, but it is timed out") + } + }) + } +} + func TestReconcileWithTimeoutForALongTimeAndEtcdLimit_Pipeline(t *testing.T) { timeout := 12 * time.Hour testCases := []struct { diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 09ca1b7a151..4ded7bde48d 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "math" "reflect" "strings" "time" @@ 
-211,7 +212,11 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon // Compute the time since the task started. elapsed := c.Clock.Since(tr.Status.StartTime.Time) // Snooze this resource until the timeout has elapsed. - return controller.NewRequeueAfter(tr.GetTimeout(ctx) - elapsed) + timeout := tr.GetTimeout(ctx) + if timeout == config.NoTimeoutDuration { + timeout = DeriveTimeoutForRequeueWaitTime(elapsed) + } + return controller.NewRequeueAfter(timeout - elapsed) } return nil } @@ -1056,3 +1061,25 @@ func retryTaskRun(tr *v1.TaskRun, message string) { taskRunCondSet := apis.NewBatchConditionSet() taskRunCondSet.Manage(&tr.Status).MarkUnknown(apis.ConditionSucceeded, v1.TaskRunReasonToBeRetried.String(), message) } + +func DeriveTimeoutForRequeueWaitTime(elapsed time.Duration) time.Duration { + timeout := time.Duration(math.MaxInt64) + // an explicit choice has been made to not make these demarcation times configurable + // wrt wait times when timeout has been disabled; rationale: allowing + // disablement of timeout is problematic, but the feature has already been + // exposed. We'll get it to behave (i.e. no negative wait times) but we + // will not add configuration to encourage use of the feature. 
+ switch { + case elapsed < 30*time.Minute: + timeout = 30 * time.Minute + case elapsed < time.Hour: + timeout = time.Hour + case elapsed < 6*time.Hour: + timeout = 6 * time.Hour + case elapsed < 24*time.Hour: + timeout = 24 * time.Hour + case elapsed < 48*time.Hour: + timeout = 48 * time.Hour + } + return timeout +} diff --git a/pkg/reconciler/taskrun/taskrun_test.go b/pkg/reconciler/taskrun/taskrun_test.go index c681f8ae04b..0f36e7fddb2 100644 --- a/pkg/reconciler/taskrun/taskrun_test.go +++ b/pkg/reconciler/taskrun/taskrun_test.go @@ -2777,6 +2777,92 @@ status: } } +func TestReconcileWithTimeoutDisabled(t *testing.T) { + type testCase struct { + name string + taskRun *v1.TaskRun + } + + testcases := []testCase{ + { + name: "taskrun with timeout", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-timeout + namespace: foo +spec: + taskRef: + name: test-task + timeout: 10m +status: + conditions: + - status: Unknown + type: Succeeded +`), + }, { + name: "taskrun with default timeout", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-default-timeout-60-minutes + namespace: foo +spec: + taskRef: + name: test-task +status: + conditions: + - status: Unknown + type: Succeeded +`), + }, { + name: "task run with timeout set to 0 to disable", + taskRun: parse.MustParseV1TaskRun(t, ` +metadata: + name: test-taskrun-timeout-disabled + namespace: foo +spec: + taskRef: + name: test-task + timeout: 0s +status: + conditions: + - status: Unknown + type: Succeeded +`), + }} + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + start := metav1.NewTime(time.Now()) + tc.taskRun.Status.StartTime = &start + pod, err := makePod(tc.taskRun, simpleTask) + d := test.Data{ + TaskRuns: []*v1.TaskRun{tc.taskRun}, + Tasks: []*v1.Task{simpleTask}, + Pods: []*corev1.Pod{pod}, + } + testAssets, cancel := getTaskRunController(t, d) + defer cancel() + c := testAssets.Controller + clients := testAssets.Clients + + err = 
c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.taskRun)) + if err == nil { + t.Errorf("expected error when reconciling completed TaskRun : %v", err) + } + if isRequeueError, requeueDuration := controller.IsRequeueKey(err); !isRequeueError { + t.Errorf("Expected requeue error, but got: %s", err.Error()) + } else if requeueDuration < 0 { + t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String()) + + } + _, err = clients.Pipeline.TektonV1().TaskRuns(tc.taskRun.Namespace).Get(testAssets.Ctx, tc.taskRun.Name, metav1.GetOptions{}) + if err != nil { + t.Errorf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", tc.taskRun.Name, err) + } + }) + } +} + func TestReconcileTimeouts(t *testing.T) { type testCase struct { name string