Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEP-0097 breakpoint before steps for taskrun #7391

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 7 additions & 21 deletions cmd/entrypoint/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package main

import (
"context"
"encoding/json"
"errors"
"flag"
Expand Down Expand Up @@ -56,6 +55,7 @@ var (
stdoutPath = flag.String("stdout_path", "", "If specified, file to copy stdout to")
stderrPath = flag.String("stderr_path", "", "If specified, file to copy stderr to")
breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
debugBeforeStep = flag.Bool("debug_before_step", false, "If specified, wait for a debugger to attach before executing the step")
onError = flag.String("on_error", "", "Set to \"continue\" to ignore an error and continue when a container terminates with a non-zero exit code."+
" Set to \"stopAndFail\" to declare a failure with a step error and stop executing the rest of the steps.")
stepMetadataDir = flag.String("step_metadata_dir", "", "If specified, create directory to store the step metadata e.g. /tekton/steps/<step-name>/")
Expand All @@ -66,25 +66,8 @@ var (

const (
defaultWaitPollingInterval = time.Second
breakpointExitSuffix = ".breakpointexit"
)

func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
if e.BreakpointOnFailure {
if waitErr := e.Waiter.Wait(context.Background(), breakpointExitPostFile, false, false); waitErr != nil {
log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error())
}
// get exitcode from .breakpointexit
exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
// if readErr exists, the exitcode with default to 0 as we would like
// to encourage to continue running the next steps in the taskRun
if readErr != nil {
log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
}
os.Exit(exitCode)
}
}

func main() {
// Add credential flags originally introduced with our legacy credentials helper
// image (creds-init).
Expand Down Expand Up @@ -172,6 +155,7 @@ func main() {
Timeout: timeout,
StepWhenExpressions: when,
BreakpointOnFailure: *breakpointOnFailure,
DebugBeforeStep: *debugBeforeStep,
OnError: *onError,
StepMetadataDir: *stepMetadataDir,
SpireWorkloadAPI: spireWorkloadAPI,
Expand All @@ -185,8 +169,10 @@ func main() {
}

if err := e.Go(); err != nil {
breakpointExitPostFile := e.PostFile + breakpointExitSuffix
switch t := err.(type) { //nolint:errorlint // checking for multiple types with errors.As is ugly.
case entrypoint.DebugBeforeStepError:
log.Println("Skipping execute step script because before step breakpoint fail-continue")
os.Exit(1)
case entrypoint.SkipError:
log.Print("Skipping step because a previous step failed")
os.Exit(1)
Expand All @@ -210,7 +196,7 @@ func main() {
// in both cases has an ExitStatus() method with the
// same signature.
if status, ok := t.Sys().(syscall.WaitStatus); ok {
checkForBreakpointOnFailure(e, breakpointExitPostFile)
e.CheckForBreakpointOnFailure()
Comment on lines -213 to +199
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, but it seems unrelated to this PR.
Is the reason for making CheckForBreakpointOnFailure public is to provide unit test coverage?

// ignore a step error i.e. do not exit if a container terminates with a non-zero exit code when onError is set to "continue"
if e.OnError != entrypoint.ContinueOnError {
os.Exit(status.ExitStatus())
Expand All @@ -221,7 +207,7 @@ func main() {
log.Fatalf("Error executing command (ExitError): %v", err)
}
default:
checkForBreakpointOnFailure(e, breakpointExitPostFile)
e.CheckForBreakpointOnFailure()
log.Fatalf("Error executing command: %v", err)
}
}
Expand Down
29 changes: 25 additions & 4 deletions docs/debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ weight: 108
- [Breakpoint on Failure](#breakpoint-on-failure)
- [Failure of a Step](#failure-of-a-step)
- [Halting a Step on failure](#halting-a-step-on-failure)
- [Exiting breakpoint](#exiting-breakpoint)
- [Exiting onfailure breakpoint](#exiting-onfailure-breakpoint)
- [Breakpoint before step](#breakpoint-before-step)
- [Debug Environment](#debug-environment)
- [Mounts](#mounts)
- [Debug Scripts](#debug-scripts)
Expand Down Expand Up @@ -59,12 +60,26 @@ stopping write of the `<step-no>.err` file and waiting on a signal by the user t
In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
environment using a CLI or an IDE.

#### Exiting breakpoint
#### Exiting onfailure breakpoint

To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
would unpause and exit the step container.

### Breakpoint before step


TaskRun will be stuck waiting for user debugging before the step execution.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: The TaskRun will be wait for use input before starting the execution of the step

When beforeStep-Breakpoint takes effect, the user can see the following information
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: what's "beforeStep-Breakpoint"? If it is the name of a configuration option or variable please use the inline code-block, else please use a description in plain english i.e. "When the breakpoint before step takes effect"

from the corresponding step container log:
```
debug before step breakpoint has taken effect, waiting for user's decision:
1) continue, use cmd: /tekton/debug/scripts/debug-beforestep-continue
2) fail-continue, use cmd: /tekton/debug/scripts/debug-beforestep-fail-continue
```
1. Executing /tekton/debug/scripts/debug-beforestep-continue will continue to execute the step program
2. Executing /tekton/debug/scripts/debug-beforestep-fail-continue will not continue to execute the task, and will mark the step as failed
Comment on lines +80 to +81
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: please wrap the file-paths with inline code blocks (e.g. `code block`)


## Debug Environment

Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
Expand All @@ -80,7 +95,13 @@ to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will
### Debug Scripts

`/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/run`. eg: User wants to exit
breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0.breakpointexit`.
onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: breakpoint on failure


`/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/run`. eg: User wants to exit
breakpoint for failed step 0. Running this script would create `/tekton/run/0.err` and `/tekton/run/0.breakpointexit`.
onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit.err`.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto


`/tekton/debug/scripts/debug-beforestep-continue` : Mark the step continue to execute by writing to `/tekton/run`. eg: User wants to exit
before step breakpoint for before step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit`.

`/tekton/debug/scripts/debug-beforestep-fail-continue` : Mark the step not continue to execute by writing to `/tekton/run`. eg: User wants to exit
before step breakpoint for before step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit.err`.
52 changes: 51 additions & 1 deletion docs/developers/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,4 +284,54 @@ There are known issues with the existing implementation of sidecars:
but an Error when the sidecar exits with an error. This is only apparent when
using `kubectl` to get the pods of a TaskRun, not when describing the Pod
using `kubectl describe pod ...` nor when looking at the TaskRun, but can be
quite confusing.
quite confusing.

## Breakpoint on Failure

Halting a TaskRun execution on Failure of a step.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: s/Failure/failure


### Failure of a Step

The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
allowing each step to run in a particular order. This is done using `-wait_file` and the `-post_file` flags. The former
let's the entrypoint binary know that it has to wait on creation of a particular file before starting execution of the step.
And the latter provides information on the step number and signal the next step on completion of the step.

On success of a step, the `-post-file` is written as is, signalling the next step which would have the same argument given
for `-wait_file` to resume the entrypoint process and move ahead with the step.

On failure of a step, the `-post_file` is written with appending `.err` to it denoting that the previous step has failed with
and error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.

### Halting a Step on failure

The failed step writes `<step-no>.err` to `/tekton/run` and stops running completely. To be able to debug a step we would
need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping,
stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a
"breakpoint".

In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
environment using a CLI or an IDE.

### Exiting onfailure breakpoint

To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
would unpause and exit the step container.

## Breakpoint before step

TaskRun will be stuck waiting for user debugging before the step execution.

### Halting a Step before execution

The step program will be executed after all the `-wait_file` monitoring ends. If want the user to enter the debugging before the step is executed,
need to pass a parameter `debug_before_step` to `entrypoint`,
and `entrypoint` will end the monitoring of `waitFiles` back pause,
waiting to listen to the `/tekton/run/0/out.beforestepexit` file

### Exiting before step breakpoint

`entrypoint` listening `/tekton/run/{{ stepID }}/out.beforestepexit` or `/tekton/run/{{ stepID }}/out.beforestepexit.err` to
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: entrypoint looks for /tekton...

decide whether to proceed this step, `out.beforestepexit` means continue with step,
`out.beforestepexit.err` means do not continue with the step.
22 changes: 22 additions & 0 deletions docs/pipeline-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -5142,6 +5142,17 @@ string
failed step will not exit</p>
</td>
</tr>
<tr>
<td>
<code>beforeSteps</code><br/>
<em>
[]string
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</tbody>
</table>
<h3 id="tekton.dev/v1.TaskKind">TaskKind
Expand Down Expand Up @@ -14926,6 +14937,17 @@ string
failed step will not exit</p>
</td>
</tr>
<tr>
<td>
<code>beforeSteps</code><br/>
<em>
[]string
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</tbody>
</table>
<h3 id="tekton.dev/v1beta1.TaskKind">TaskKind
Expand Down
16 changes: 16 additions & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,18 @@ spec:
onFailure: "enabled"
```

### Breakpoint before step

If you want to set a breakpoint before the step is executed, you can add the step name to the `beforeSteps` field in the following way:

```yaml
spec:
debug:
breakpoints:
beforeSteps:
- {{ stepName }}
```

Upon failure of a step, the TaskRun Pod execution is halted. If this TaskRun Pod continues to run without any lifecycle
change done by the user (running the debug-continue or debug-fail-continue script) the TaskRun would be subject to
[TaskRunTimeout](#configuring-the-failure-timeout).
Expand All @@ -931,6 +943,10 @@ perform :-

`debug-fail-continue`: Mark the step as a failure and exit the breakpoint.

`debug-beforestep-continue`: Mark the step continue to execute

`debug-beforestep-fail-continue`: Mark the step not continue to execute

*More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*

## Code examples
Expand Down
19 changes: 19 additions & 0 deletions pkg/apis/pipeline/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions pkg/apis/pipeline/v1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -1821,6 +1821,14 @@
"description": "TaskBreakpoints defines the breakpoint config for a particular Task",
"type": "object",
"properties": {
"beforeSteps": {
"type": "array",
"items": {
"type": "string",
"default": ""
},
"x-kubernetes-list-type": "atomic"
},
"onFailure": {
"description": "if enabled, pause TaskRun on failure of a step failed step will not exit",
"type": "string"
Expand Down
22 changes: 20 additions & 2 deletions pkg/apis/pipeline/v1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/utils/clock"
"knative.dev/pkg/apis"
duckv1 "knative.dev/pkg/apis/duck/v1"
Expand Down Expand Up @@ -121,6 +122,9 @@ type TaskBreakpoints struct {
// failed step will not exit
// +optional
OnFailure string `json:"onFailure,omitempty"`
// +optional
// +listType=atomic
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: I guess this could be "set" as well, since it doesn't make sense to have the same step twice, and it's also enforece at validation time

BeforeSteps []string `json:"beforeSteps,omitempty"`
}

// NeedsDebugOnFailure return true if the TaskRun is configured to debug on failure
Expand All @@ -131,14 +135,28 @@ func (trd *TaskRunDebug) NeedsDebugOnFailure() bool {
return trd.Breakpoints.OnFailure == EnabledOnFailureBreakpoint
}

// NeedsDebugBeforeStep return true if the step is configured to debug before execution
func (trd *TaskRunDebug) NeedsDebugBeforeStep(stepName string) bool {
if trd.Breakpoints == nil {
return false
}
beforeStepSets := sets.NewString(trd.Breakpoints.BeforeSteps...)
return beforeStepSets.Has(stepName)
}

// StepNeedsDebug return true if the step is configured to debug
func (trd *TaskRunDebug) StepNeedsDebug(stepName string) bool {
return trd.NeedsDebugOnFailure()
return trd.NeedsDebugOnFailure() || trd.NeedsDebugBeforeStep(stepName)
}

// NeedsDebug return true if defined onfailure or have any before, after steps
func (trd *TaskRunDebug) NeedsDebug() bool {
return trd.NeedsDebugOnFailure()
return trd.NeedsDebugOnFailure() || trd.HaveBeforeSteps()
}

// HaveBeforeSteps return true if have any before steps
func (trd *TaskRunDebug) HaveBeforeSteps() bool {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: to be consistent with other method names, this should be HasBeforeSteps

return trd.Breakpoints != nil && len(trd.Breakpoints.BeforeSteps) > 0
}

// TaskRunInputs holds the input values that this task was invoked with.
Expand Down
Loading