Skip to content

Commit

Permalink
TEP-0097 breakpoint before steps for taskrun
Browse files Browse the repository at this point in the history
Signed-off-by: chengjoey <[email protected]>
  • Loading branch information
chengjoey committed Aug 2, 2024
1 parent dd36119 commit 02a269e
Show file tree
Hide file tree
Showing 28 changed files with 910 additions and 134 deletions.
28 changes: 7 additions & 21 deletions cmd/entrypoint/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package main

import (
"context"
"encoding/json"
"errors"
"flag"
Expand Down Expand Up @@ -56,6 +55,7 @@ var (
stdoutPath = flag.String("stdout_path", "", "If specified, file to copy stdout to")
stderrPath = flag.String("stderr_path", "", "If specified, file to copy stderr to")
breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
debugBeforeStep = flag.Bool("debug_before_step", false, "If specified, wait for a debugger to attach before executing the step")
onError = flag.String("on_error", "", "Set to \"continue\" to ignore an error and continue when a container terminates with a non-zero exit code."+
" Set to \"stopAndFail\" to declare a failure with a step error and stop executing the rest of the steps.")
stepMetadataDir = flag.String("step_metadata_dir", "", "If specified, create directory to store the step metadata e.g. /tekton/steps/<step-name>/")
Expand All @@ -66,25 +66,8 @@ var (

const (
defaultWaitPollingInterval = time.Second
breakpointExitSuffix = ".breakpointexit"
)

func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
if e.BreakpointOnFailure {
if waitErr := e.Waiter.Wait(context.Background(), breakpointExitPostFile, false, false); waitErr != nil {
log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error())
}
// get exitcode from .breakpointexit
exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
// if readErr exists, the exitcode with default to 0 as we would like
// to encourage to continue running the next steps in the taskRun
if readErr != nil {
log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
}
os.Exit(exitCode)
}
}

func main() {
// Add credential flags originally introduced with our legacy credentials helper
// image (creds-init).
Expand Down Expand Up @@ -172,6 +155,7 @@ func main() {
Timeout: timeout,
StepWhenExpressions: when,
BreakpointOnFailure: *breakpointOnFailure,
DebugBeforeStep: *debugBeforeStep,
OnError: *onError,
StepMetadataDir: *stepMetadataDir,
SpireWorkloadAPI: spireWorkloadAPI,
Expand All @@ -185,8 +169,10 @@ func main() {
}

if err := e.Go(); err != nil {
breakpointExitPostFile := e.PostFile + breakpointExitSuffix
switch t := err.(type) { //nolint:errorlint // checking for multiple types with errors.As is ugly.
case entrypoint.DebugBeforeStepError:
log.Println("Skipping execute step script because before step breakpoint fail-continue")
os.Exit(1)
case entrypoint.SkipError:
log.Print("Skipping step because a previous step failed")
os.Exit(1)
Expand All @@ -210,7 +196,7 @@ func main() {
// in both cases has an ExitStatus() method with the
// same signature.
if status, ok := t.Sys().(syscall.WaitStatus); ok {
checkForBreakpointOnFailure(e, breakpointExitPostFile)
e.CheckForBreakpointOnFailure()
// ignore a step error i.e. do not exit if a container terminates with a non-zero exit code when onError is set to "continue"
if e.OnError != entrypoint.ContinueOnError {
os.Exit(status.ExitStatus())
Expand All @@ -221,7 +207,7 @@ func main() {
log.Fatalf("Error executing command (ExitError): %v", err)
}
default:
checkForBreakpointOnFailure(e, breakpointExitPostFile)
e.CheckForBreakpointOnFailure()
log.Fatalf("Error executing command: %v", err)
}
}
Expand Down
29 changes: 25 additions & 4 deletions docs/debug.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ weight: 108
- [Breakpoint on Failure](#breakpoint-on-failure)
- [Failure of a Step](#failure-of-a-step)
- [Halting a Step on failure](#halting-a-step-on-failure)
- [Exiting breakpoint](#exiting-breakpoint)
- [Exiting onfailure breakpoint](#exiting-onfailure-breakpoint)
- [Breakpoint before step](#breakpoint-before-step)
- [Debug Environment](#debug-environment)
- [Mounts](#mounts)
- [Debug Scripts](#debug-scripts)
Expand Down Expand Up @@ -59,12 +60,26 @@ stopping write of the `<step-no>.err` file and waiting on a signal by the user t
In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
environment using a CLI or an IDE.

#### Exiting breakpoint
#### Exiting onfailure breakpoint

To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
would unpause and exit the step container.

### Breakpoint before step


TaskRun will be stuck waiting for user debugging before the step execution.
When beforeStep-Breakpoint takes effect, the user can see the following information
from the corresponding step container log:
```
debug before step breakpoint has taken effect, waiting for user's decision:
1) continue, use cmd: /tekton/debug/scripts/debug-beforestep-continue
2) fail-continue, use cmd: /tekton/debug/scripts/debug-beforestep-fail-continue
```
1. Executing /tekton/debug/scripts/debug-beforestep-continue will continue to execute the step program
2. Executing /tekton/debug/scripts/debug-beforestep-fail-continue will not continue to execute the task, and will mark the step as failed

## Debug Environment

Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
Expand All @@ -80,7 +95,13 @@ to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will
### Debug Scripts

`/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/run`. eg: User wants to exit
breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0.breakpointexit`.
onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit`.

`/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/run`. eg: User wants to exit
breakpoint for failed step 0. Running this script would create `/tekton/run/0.err` and `/tekton/run/0.breakpointexit`.
onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit.err`.

`/tekton/debug/scripts/debug-beforestep-continue` : Mark the step continue to execute by writing to `/tekton/run`. eg: User wants to exit
before step breakpoint for before step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit`.

`/tekton/debug/scripts/debug-beforestep-fail-continue` : Mark the step not continue to execute by writing to `/tekton/run`. eg: User wants to exit
before step breakpoint for before step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit.err`.
52 changes: 51 additions & 1 deletion docs/developers/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,4 +284,54 @@ There are known issues with the existing implementation of sidecars:
but an Error when the sidecar exits with an error. This is only apparent when
using `kubectl` to get the pods of a TaskRun, not when describing the Pod
using `kubectl describe pod ...` nor when looking at the TaskRun, but can be
quite confusing.
quite confusing.
## Breakpoint on Failure
Halting a TaskRun execution on Failure of a step.
### Failure of a Step
The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
allowing each step to run in a particular order. This is done using `-wait_file` and the `-post_file` flags. The former
let's the entrypoint binary know that it has to wait on creation of a particular file before starting execution of the step.
And the latter provides information on the step number and signal the next step on completion of the step.
On success of a step, the `-post-file` is written as is, signalling the next step which would have the same argument given
for `-wait_file` to resume the entrypoint process and move ahead with the step.
On failure of a step, the `-post_file` is written with appending `.err` to it denoting that the previous step has failed with
and error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.
### Halting a Step on failure
The failed step writes `<step-no>.err` to `/tekton/run` and stops running completely. To be able to debug a step we would
need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping,
stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a
"breakpoint".
In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
environment using a CLI or an IDE.
### Exiting onfailure breakpoint
To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
would unpause and exit the step container.
## Breakpoint before step
TaskRun will be stuck waiting for user debugging before the step execution.
### Halting a Step before execution
The step program will be executed after all the `-wait_file` monitoring ends. If want the user to enter the debugging before the step is executed,
need to pass a parameter `debug_before_step` to `entrypoint`,
and `entrypoint` will end the monitoring of `waitFiles` back pause,
waiting to listen to the `/tekton/run/0/out.beforestepexit` file
### Exiting before step breakpoint
`entrypoint` listening `/tekton/run/{{ stepID }}/out.beforestepexit` or `/tekton/run/{{ stepID }}/out.beforestepexit.err` to
decide whether to proceed this step, `out.beforestepexit` means continue with step,
`out.beforestepexit.err` means do not continue with the step.
22 changes: 22 additions & 0 deletions docs/pipeline-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -5142,6 +5142,17 @@ string
failed step will not exit</p>
</td>
</tr>
<tr>
<td>
<code>beforeSteps</code><br/>
<em>
[]string
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</tbody>
</table>
<h3 id="tekton.dev/v1.TaskKind">TaskKind
Expand Down Expand Up @@ -14926,6 +14937,17 @@ string
failed step will not exit</p>
</td>
</tr>
<tr>
<td>
<code>beforeSteps</code><br/>
<em>
[]string
</em>
</td>
<td>
<em>(Optional)</em>
</td>
</tr>
</tbody>
</table>
<h3 id="tekton.dev/v1beta1.TaskKind">TaskKind
Expand Down
16 changes: 16 additions & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -909,6 +909,18 @@ spec:
onFailure: "enabled"
```

### Breakpoint before step

If you want to set a breakpoint before the step is executed, you can add the step name to the `beforeSteps` field in the following way:

```yaml
spec:
debug:
breakpoints:
beforeSteps:
- {{ stepName }}
```

Upon failure of a step, the TaskRun Pod execution is halted. If this TaskRun Pod continues to run without any lifecycle
change done by the user (running the debug-continue or debug-fail-continue script) the TaskRun would be subject to
[TaskRunTimeout](#configuring-the-failure-timeout).
Expand All @@ -931,6 +943,10 @@ perform :-

`debug-fail-continue`: Mark the step as a failure and exit the breakpoint.

`debug-beforestep-continue`: Mark the step continue to execute

`debug-beforestep-fail-continue`: Mark the step not continue to execute

*More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*

## Code examples
Expand Down
19 changes: 19 additions & 0 deletions pkg/apis/pipeline/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions pkg/apis/pipeline/v1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -1821,6 +1821,14 @@
"description": "TaskBreakpoints defines the breakpoint config for a particular Task",
"type": "object",
"properties": {
"beforeSteps": {
"type": "array",
"items": {
"type": "string",
"default": ""
},
"x-kubernetes-list-type": "atomic"
},
"onFailure": {
"description": "if enabled, pause TaskRun on failure of a step failed step will not exit",
"type": "string"
Expand Down
22 changes: 20 additions & 2 deletions pkg/apis/pipeline/v1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/utils/clock"
"knative.dev/pkg/apis"
duckv1 "knative.dev/pkg/apis/duck/v1"
Expand Down Expand Up @@ -121,6 +122,9 @@ type TaskBreakpoints struct {
// failed step will not exit
// +optional
OnFailure string `json:"onFailure,omitempty"`
// +optional
// +listType=atomic
BeforeSteps []string `json:"beforeSteps,omitempty"`
}

// NeedsDebugOnFailure return true if the TaskRun is configured to debug on failure
Expand All @@ -131,14 +135,28 @@ func (trd *TaskRunDebug) NeedsDebugOnFailure() bool {
return trd.Breakpoints.OnFailure == EnabledOnFailureBreakpoint
}

// NeedsDebugBeforeStep return true if the step is configured to debug before execution
func (trd *TaskRunDebug) NeedsDebugBeforeStep(stepName string) bool {
if trd.Breakpoints == nil {
return false
}
beforeStepSets := sets.NewString(trd.Breakpoints.BeforeSteps...)
return beforeStepSets.Has(stepName)
}

// StepNeedsDebug return true if the step is configured to debug
func (trd *TaskRunDebug) StepNeedsDebug(stepName string) bool {
return trd.NeedsDebugOnFailure()
return trd.NeedsDebugOnFailure() || trd.NeedsDebugBeforeStep(stepName)
}

// NeedsDebug return true if defined onfailure or have any before, after steps
func (trd *TaskRunDebug) NeedsDebug() bool {
return trd.NeedsDebugOnFailure()
return trd.NeedsDebugOnFailure() || trd.HaveBeforeSteps()
}

// HaveBeforeSteps return true if have any before steps
func (trd *TaskRunDebug) HaveBeforeSteps() bool {
return trd.Breakpoints != nil && len(trd.Breakpoints.BeforeSteps) > 0
}

// TaskRunInputs holds the input values that this task was invoked with.
Expand Down
Loading

0 comments on commit 02a269e

Please sign in to comment.