Skip to content

Commit

Permalink
do not allow negative requeue times
Browse files Browse the repository at this point in the history
Using a value of 0 for the taskrun/pipeline timeout, which per https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#configuring-a-failure-timeout for example means the timeout
is disabled, results in a negative waitTime being passed to the Requeue event.  The observed behavior was an immediate requeue, and intense cycles of many
reconciliations per second were observed if the TaskRun's/PipelineRun's state did not in fact change.  This artificially constrained the performance of the pipeline controller.

This change makes sure the wait time passed to the Requeue is not negative.
  • Loading branch information
gabemontero committed Jan 19, 2024
1 parent e1c7828 commit 3397aa3
Show file tree
Hide file tree
Showing 4 changed files with 220 additions and 7 deletions.
18 changes: 12 additions & 6 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,6 @@ import (
"encoding/json"
"errors"
"fmt"
"path/filepath"
"reflect"
"regexp"
"strings"

"github.com/hashicorp/go-multierror"
"github.com/tektoncd/pipeline/pkg/apis/config"
"github.com/tektoncd/pipeline/pkg/apis/pipeline"
Expand Down Expand Up @@ -74,6 +69,10 @@ import (
"knative.dev/pkg/kmeta"
"knative.dev/pkg/logging"
pkgreconciler "knative.dev/pkg/reconciler"
"path/filepath"
"reflect"
"regexp"
"strings"
)

// Aliased for backwards compatibility; do not add additional reasons here
Expand Down Expand Up @@ -273,7 +272,14 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pr *v1.PipelineRun) pkgr
// Compute the time since the task started.
elapsed := c.Clock.Since(pr.Status.StartTime.Time)
// Snooze this resource until the appropriate timeout has elapsed.
waitTime := pr.PipelineTimeout(ctx) - elapsed
// but if the timeout has been disabled by setting timeout to 0, we
// do not want to subtract from 0, because a negative wait time will
// result in the requeue happening essentially immediately
timeout := pr.PipelineTimeout(ctx)
if timeout == config.NoTimeoutDuration {
timeout = taskrun.DeriveTimeoutForRequeueWaitTime(elapsed)
}
waitTime := timeout - elapsed
if pr.Status.FinallyStartTime == nil && pr.TasksTimeout() != nil {
waitTime = pr.TasksTimeout().Duration - elapsed
} else if pr.Status.FinallyStartTime != nil && pr.FinallyTimeout() != nil {
Expand Down
92 changes: 92 additions & 0 deletions pkg/reconciler/pipelinerun/pipelinerun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2486,6 +2486,98 @@ spec:
}
}

// TestReconcileWithTimeoutDisabled reconciles a PipelineRun whose pipeline
// timeout is set to 0h0m0s and checks that the reconciler returns a requeue
// error with a non-negative delay and that the run is not reported as
// "PipelineRunTimeout".
func TestReconcileWithTimeoutDisabled(t *testing.T) {
	// NOTE: despite the field name, timeout is not applied to the PipelineRun
	// spec (the spec always uses 0h0m0s below); it is only used to push the
	// run's start time back by 3*timeout so the run looks long-running.
	testCases := []struct {
		name    string
		timeout time.Duration
	}{
		{
			name:    "pipeline timeout is 24h",
			timeout: 24 * time.Hour,
		},
		{
			name:    "pipeline timeout is way longer than 24h",
			timeout: 360 * time.Hour,
		},
	}

	for _, tc := range testCases {
		startTime := time.Date(2022, time.January, 1, 0, 0, 0, 0, time.UTC).Add(-3 * tc.timeout)
		t.Run(tc.name, func(t *testing.T) {
			ps := []*v1.Pipeline{parse.MustParseV1Pipeline(t, `
metadata:
name: test-pipeline
namespace: foo
spec:
tasks:
- name: hello-world-1
taskRef:
name: hello-world
- name: hello-world-2
taskRef:
name: hello-world
`)}
			prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, `
metadata:
name: test-pipeline-run-with-timeout-disabled
namespace: foo
spec:
pipelineRef:
name: test-pipeline
taskRunTemplate:
serviceAccountName: test-sa
timeouts:
pipeline: 0h0m0s
status:
startTime: "2021-12-30T00:00:00Z"
`)}
			ts := []*v1.Task{simpleHelloWorldTask}

			trs := []*v1.TaskRun{mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("test-pipeline-run-with-timeout-hello-world-1", "foo", "test-pipeline-run-with-timeout-disabled",
				"test-pipeline", "hello-world-1", false), `
spec:
serviceAccountName: test-sa
taskRef:
name: hello-world
kind: Task
`)}
			// The start time parsed from the YAML above is replaced by the
			// per-case value computed outside the subtest.
			start := metav1.NewTime(startTime)
			prs[0].Status.StartTime = &start

			d := test.Data{
				PipelineRuns: prs,
				Pipelines:    ps,
				Tasks:        ts,
				TaskRuns:     trs,
			}
			prt := newPipelineRunTest(t, d)
			defer prt.Cancel()

			c := prt.TestAssets.Controller
			clients := prt.TestAssets.Clients
			reconcileError := c.Reconciler.Reconcile(prt.TestAssets.Ctx, "foo/test-pipeline-run-with-timeout-disabled")
			if reconcileError == nil {
				// Fatal: the checks below call reconcileError.Error(), which
				// would panic on a nil error.
				t.Fatalf("expected error, but got nil")
			}
			if isRequeueError, requeueDuration := controller.IsRequeueKey(reconcileError); !isRequeueError {
				t.Errorf("Expected requeue error, but got: %s", reconcileError.Error())
			} else if requeueDuration < 0 {
				t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String())
			}
			prt.Test.Logf("Getting reconciled run")
			reconciledRun, err := clients.Pipeline.TektonV1().PipelineRuns("foo").Get(prt.TestAssets.Ctx, "test-pipeline-run-with-timeout-disabled", metav1.GetOptions{})
			if err != nil {
				// Fatal: reconciledRun is dereferenced below and cannot be
				// trusted when Get fails.
				t.Fatalf("Somehow had error getting reconciled run out of fake client: %s", err)
			}
			// Guard against a missing Succeeded condition before reading Reason.
			if cond := reconciledRun.Status.GetCondition(apis.ConditionSucceeded); cond != nil && cond.Reason == "PipelineRunTimeout" {
				t.Errorf("Expected PipelineRun to not be timed out, but it is timed out")
			}
		})
	}
}

func TestReconcileWithTimeoutForALongTimeAndEtcdLimit_Pipeline(t *testing.T) {
timeout := 12 * time.Hour
testCases := []struct {
Expand Down
29 changes: 28 additions & 1 deletion pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"errors"
"fmt"
"math"
"reflect"
"strings"
"time"
Expand Down Expand Up @@ -211,7 +212,11 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
// Compute the time since the task started.
elapsed := c.Clock.Since(tr.Status.StartTime.Time)
// Snooze this resource until the timeout has elapsed.
return controller.NewRequeueAfter(tr.GetTimeout(ctx) - elapsed)
timeout := tr.GetTimeout(ctx)
if timeout == config.NoTimeoutDuration {
timeout = DeriveTimeoutForRequeueWaitTime(elapsed)
}
return controller.NewRequeueAfter(timeout - elapsed)
}
return nil
}
Expand Down Expand Up @@ -1056,3 +1061,25 @@ func retryTaskRun(tr *v1.TaskRun, message string) {
taskRunCondSet := apis.NewBatchConditionSet()
taskRunCondSet.Manage(&tr.Status).MarkUnknown(apis.ConditionSucceeded, v1.TaskRunReasonToBeRetried.String(), message)
}

// DeriveTimeoutForRequeueWaitTime picks a stand-in timeout for a run that has
// been going for elapsed. It returns the smallest of 30m, 1h, 6h, 24h and 48h
// that is strictly greater than elapsed, or the maximum representable
// time.Duration when elapsed is 48h or more.
//
// The steps are deliberately not configurable: allowing the timeout to be
// disabled is considered problematic, but the feature is already exposed.
// The aim is only to make it behave (i.e. no negative wait times), not to add
// configuration that would encourage its use.
func DeriveTimeoutForRequeueWaitTime(elapsed time.Duration) time.Duration {
	// Ascending ceilings; the first one still ahead of elapsed wins.
	ceilings := [...]time.Duration{
		30 * time.Minute,
		time.Hour,
		6 * time.Hour,
		24 * time.Hour,
		48 * time.Hour,
	}
	for _, ceiling := range ceilings {
		if elapsed < ceiling {
			return ceiling
		}
	}
	// Past the last ceiling: fall back to the largest possible duration.
	return time.Duration(math.MaxInt64)
}
88 changes: 88 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2777,6 +2777,94 @@ status:
}
}

// TestReconcileWithTimeoutDisabled reconciles TaskRuns with an explicit
// timeout, with no timeout in the spec, and with the timeout set to 0s. For
// each it checks that the reconciler returns a requeue error with a
// non-negative delay and that the TaskRun can still be fetched afterwards.
func TestReconcileWithTimeoutDisabled(t *testing.T) {
	type testCase struct {
		name    string
		taskRun *v1.TaskRun
	}

	testcases := []testCase{
		{
			name: "taskrun with timeout",
			taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-timeout
namespace: foo
spec:
taskRef:
name: test-task
timeout: 10m
status:
conditions:
- status: Unknown
type: Succeeded
`),
		}, {
			name: "taskrun with default timeout",
			taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-default-timeout-60-minutes
namespace: foo
spec:
taskRef:
name: test-task
status:
conditions:
- status: Unknown
type: Succeeded
`),
		}, {
			name: "task run with timeout set to 0 to disable",
			taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-timeout-disabled
namespace: foo
spec:
taskRef:
name: test-task
timeout: 0s
status:
conditions:
- status: Unknown
type: Succeeded
`),
		}}

	for _, tc := range testcases {
		t.Run(tc.name, func(t *testing.T) {
			// Mark the run as having just started so the elapsed time is
			// small at reconcile time.
			start := metav1.NewTime(time.Now())
			tc.taskRun.Status.StartTime = &start
			pod, err := makePod(tc.taskRun, simpleTask)
			if err != nil {
				// Previously this error was silently overwritten by the
				// Reconcile call below.
				t.Fatalf("MakePod: %v", err)
			}
			d := test.Data{
				TaskRuns: []*v1.TaskRun{tc.taskRun},
				Tasks:    []*v1.Task{simpleTask},
				Pods:     []*corev1.Pod{pod},
			}
			testAssets, cancel := getTaskRunController(t, d)
			defer cancel()
			c := testAssets.Controller
			clients := testAssets.Clients

			err = c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.taskRun))
			if err == nil {
				// Fatal: the requeue check below calls err.Error(), which
				// would panic on a nil error.
				t.Fatalf("expected error when reconciling completed TaskRun : %v", err)
			}
			if isRequeueError, requeueDuration := controller.IsRequeueKey(err); !isRequeueError {
				t.Errorf("Expected requeue error, but got: %s", err.Error())
			} else if requeueDuration < 0 {
				t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String())
			}
			_, err = clients.Pipeline.TektonV1().TaskRuns(tc.taskRun.Namespace).Get(testAssets.Ctx, tc.taskRun.Name, metav1.GetOptions{})
			if err != nil {
				t.Errorf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", tc.taskRun.Name, err)
			}
		})
	}
}

func TestReconcileTimeouts(t *testing.T) {
type testCase struct {
name string
Expand Down

0 comments on commit 3397aa3

Please sign in to comment.