Skip to content

Commit

Permalink
do not allow negative requeue times
Browse files Browse the repository at this point in the history
Use of the value of 0 for the taskrun/pipeline timeout, which per https://github.com/tektoncd/pipeline/blob/main/docs/pipelineruns.md#configuring-a-failure-timeout for example means timeout
is disabled, results in the waitTime passed to the Requeue event to be negative.  This had the observed behavior of Requeue'ing immediately, and intense cycles of many
reconcilations per second were observed if the TaskRun's/PipelineRun's state did not in fact change.  This artificially constrained the peformance of the pipeline controller.

This change makes sure the wait time passed to the Requeue is not negative.

rh-pre-commit.version: 2.1.0
rh-pre-commit.check-secrets: ENABLED
  • Loading branch information
gabemontero authored and tekton-robot committed Feb 6, 2024
1 parent ec051b2 commit 856fb4f
Show file tree
Hide file tree
Showing 6 changed files with 199 additions and 3 deletions.
2 changes: 2 additions & 0 deletions docs/pipelineruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -1424,6 +1424,8 @@ If you set the timeout to 0, the `PipelineRun` fails immediately upon encounteri

> :warning: ** `timeout` is deprecated and will be removed in future versions. Consider using `timeouts` instead.

> :note: An internal detail of the `PipelineRun` and `TaskRun` reconcilers in the Tekton controller is that it will requeue a `PipelineRun` or `TaskRun` for re-evaluation, versus waiting for the next update, under certain conditions. The wait time for that re-queueing is the elapsed time subtracted from the timeout; however, if the timeout is set to '0', that calculation produces a negative number, and the new reconciliation event will fire immediately, which can impact overall performance, which is counter to the intent of wait time calculation. So instead, the reconcilers will use the configured global timeout as the wait time when the associated timeout has been set to '0'.

## `PipelineRun` status

### The `status` field
Expand Down
2 changes: 2 additions & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -771,6 +771,8 @@ a different global default timeout value using the `default-timeout-minutes` fie
all `TaskRuns` that do not have a timeout set will have no timeout and will run until it completes successfully
or fails from an error.

> :note: An internal detail of the `PipelineRun` and `TaskRun` reconcilers in the Tekton controller is that it will requeue a `PipelineRun` or `TaskRun` for re-evaluation, versus waiting for the next update, under certain conditions. The wait time for that re-queueing is the elapsed time subtracted from the timeout; however, if the timeout is set to '0', that calculation produces a negative number, and the new reconciliation event will fire immediately, which can impact overall performance, which is counter to the intent of wait time calculation. So instead, the reconcilers will use the configured global timeout as the wait time when the associated timeout has been set to '0'.

### Specifying `ServiceAccount` credentials

You can execute the `Task` in your `TaskRun` with a specific set of credentials by
Expand Down
16 changes: 14 additions & 2 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (
"reflect"
"regexp"
"strings"
"time"

"github.com/hashicorp/go-multierror"
"github.com/tektoncd/pipeline/pkg/apis/config"
Expand Down Expand Up @@ -273,9 +274,20 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, pr *v1.PipelineRun) pkgr
// Compute the time since the task started.
elapsed := c.Clock.Since(pr.Status.StartTime.Time)
// Snooze this resource until the appropriate timeout has elapsed.
waitTime := pr.PipelineTimeout(ctx) - elapsed
if pr.Status.FinallyStartTime == nil && pr.TasksTimeout() != nil {
// but if the timeout has been disabled by setting timeout to 0, we
// do not want to subtract from 0, because a negative wait time will
// result in the requeue happening essentially immediately
timeout := pr.PipelineTimeout(ctx)
taskTimeout := pr.TasksTimeout()
waitTime := timeout - elapsed
if timeout == config.NoTimeoutDuration {
waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute
}
if pr.Status.FinallyStartTime == nil && taskTimeout != nil {
waitTime = pr.TasksTimeout().Duration - elapsed
if taskTimeout.Duration == config.NoTimeoutDuration {
waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute
}
} else if pr.Status.FinallyStartTime != nil && pr.FinallyTimeout() != nil {
finallyWaitTime := pr.FinallyTimeout().Duration - c.Clock.Since(pr.Status.FinallyStartTime.Time)
if finallyWaitTime < waitTime {
Expand Down
90 changes: 90 additions & 0 deletions pkg/reconciler/pipelinerun/pipelinerun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2486,6 +2486,96 @@ spec:
}
}

func TestReconcileWithTimeoutDisabled(t *testing.T) {
testCases := []struct {
name string
timeout time.Duration
}{
{
name: "pipeline timeout is 24h",
timeout: 24 * time.Hour,
},
{
name: "pipeline timeout is way longer than 24h",
timeout: 360 * time.Hour,
},
}

for _, tc := range testCases {
startTime := time.Date(2022, time.January, 1, 0, 0, 0, 0, time.UTC).Add(-3 * tc.timeout)
t.Run(tc.name, func(t *testing.T) {
ps := []*v1.Pipeline{parse.MustParseV1Pipeline(t, `
metadata:
name: test-pipeline
namespace: foo
spec:
tasks:
- name: hello-world-1
taskRef:
name: hello-world
- name: hello-world-2
taskRef:
name: hello-world
`)}
prs := []*v1.PipelineRun{parse.MustParseV1PipelineRun(t, `
metadata:
name: test-pipeline-run-with-timeout-disabled
namespace: foo
spec:
pipelineRef:
name: test-pipeline
taskRunTemplate:
serviceAccountName: test-sa
timeouts:
pipeline: 0h0m0s
status:
startTime: "2021-12-30T00:00:00Z"
`)}
ts := []*v1.Task{simpleHelloWorldTask}

trs := []*v1.TaskRun{mustParseTaskRunWithObjectMeta(t, taskRunObjectMeta("test-pipeline-run-with-timeout-hello-world-1", "foo", "test-pipeline-run-with-timeout-disabled",
"test-pipeline", "hello-world-1", false), `
spec:
serviceAccountName: test-sa
taskRef:
name: hello-world
kind: Task
`)}
start := metav1.NewTime(startTime)
prs[0].Status.StartTime = &start

d := test.Data{
PipelineRuns: prs,
Pipelines: ps,
Tasks: ts,
TaskRuns: trs,
}
prt := newPipelineRunTest(t, d)
defer prt.Cancel()

c := prt.TestAssets.Controller
clients := prt.TestAssets.Clients
reconcileError := c.Reconciler.Reconcile(prt.TestAssets.Ctx, "foo/test-pipeline-run-with-timeout-disabled")
if reconcileError == nil {
t.Errorf("expected error, but got nil")
}
if isRequeueError, requeueDuration := controller.IsRequeueKey(reconcileError); !isRequeueError {
t.Errorf("Expected requeue error, but got: %s", reconcileError.Error())
} else if requeueDuration < 0 {
t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String())
}
prt.Test.Logf("Getting reconciled run")
reconciledRun, err := clients.Pipeline.TektonV1().PipelineRuns("foo").Get(prt.TestAssets.Ctx, "test-pipeline-run-with-timeout-disabled", metav1.GetOptions{})
if err != nil {
prt.Test.Errorf("Somehow had error getting reconciled run out of fake client: %s", err)
}
if reconciledRun.Status.GetCondition(apis.ConditionSucceeded).Reason == "PipelineRunTimeout" {
t.Errorf("Expected PipelineRun to not be timed out, but it is timed out")
}
})
}
}

func TestReconcileWithTimeoutForALongTimeAndEtcdLimit_Pipeline(t *testing.T) {
timeout := 12 * time.Hour
testCases := []struct {
Expand Down
7 changes: 6 additions & 1 deletion pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,12 @@ func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1.TaskRun) pkgrecon
// Compute the time since the task started.
elapsed := c.Clock.Since(tr.Status.StartTime.Time)
// Snooze this resource until the timeout has elapsed.
return controller.NewRequeueAfter(tr.GetTimeout(ctx) - elapsed)
timeout := tr.GetTimeout(ctx)
waitTime := timeout - elapsed
if timeout == config.NoTimeoutDuration {
waitTime = time.Duration(config.FromContextOrDefaults(ctx).Defaults.DefaultTimeoutMinutes) * time.Minute
}
return controller.NewRequeueAfter(waitTime)
}
return nil
}
Expand Down
85 changes: 85 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2778,6 +2778,91 @@ status:
}
}

func TestReconcileWithTimeoutDisabled(t *testing.T) {
type testCase struct {
name string
taskRun *v1.TaskRun
}

testcases := []testCase{
{
name: "taskrun with timeout",
taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-timeout
namespace: foo
spec:
taskRef:
name: test-task
timeout: 10m
status:
conditions:
- status: Unknown
type: Succeeded
`),
}, {
name: "taskrun with default timeout",
taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-default-timeout-60-minutes
namespace: foo
spec:
taskRef:
name: test-task
status:
conditions:
- status: Unknown
type: Succeeded
`),
}, {
name: "task run with timeout set to 0 to disable",
taskRun: parse.MustParseV1TaskRun(t, `
metadata:
name: test-taskrun-timeout-disabled
namespace: foo
spec:
taskRef:
name: test-task
timeout: 0s
status:
conditions:
- status: Unknown
type: Succeeded
`),
}}

for _, tc := range testcases {
t.Run(tc.name, func(t *testing.T) {
start := metav1.NewTime(time.Now())
tc.taskRun.Status.StartTime = &start
pod, err := makePod(tc.taskRun, simpleTask)
d := test.Data{
TaskRuns: []*v1.TaskRun{tc.taskRun},
Tasks: []*v1.Task{simpleTask},
Pods: []*corev1.Pod{pod},
}
testAssets, cancel := getTaskRunController(t, d)
defer cancel()
c := testAssets.Controller
clients := testAssets.Clients

err = c.Reconciler.Reconcile(testAssets.Ctx, getRunName(tc.taskRun))
if err == nil {
t.Errorf("expected error when reconciling completed TaskRun : %v", err)
}
if isRequeueError, requeueDuration := controller.IsRequeueKey(err); !isRequeueError {
t.Errorf("Expected requeue error, but got: %s", err.Error())
} else if requeueDuration < 0 {
t.Errorf("Expected a positive requeue duration but got %s", requeueDuration.String())
}
_, err = clients.Pipeline.TektonV1().TaskRuns(tc.taskRun.Namespace).Get(testAssets.Ctx, tc.taskRun.Name, metav1.GetOptions{})
if err != nil {
t.Errorf("Expected completed TaskRun %s to exist but instead got error when getting it: %v", tc.taskRun.Name, err)
}
})
}
}

func TestReconcileTimeouts(t *testing.T) {
type testCase struct {
name string
Expand Down

0 comments on commit 856fb4f

Please sign in to comment.