tektoncd · tekton-robot · Aug 22, 2024 · Nov 18, 2023 · afrittoli · Aug 2, 2024
diff --git a/cmd/entrypoint/main.go b/cmd/entrypoint/main.go
@@ -17,7 +17,6 @@ limitations under the License.
 package main
 
 import (
-	"context"
 	"encoding/json"
 	"errors"
 	"flag"
@@ -56,6 +55,7 @@ var (
 	stdoutPath          = flag.String("stdout_path", "", "If specified, file to copy stdout to")
 	stderrPath          = flag.String("stderr_path", "", "If specified, file to copy stderr to")
 	breakpointOnFailure = flag.Bool("breakpoint_on_failure", false, "If specified, expect steps to not skip on failure")
+	debugBeforeStep     = flag.Bool("debug_before_step", false, "If specified, wait for a debugger to attach before executing the step")
 	onError             = flag.String("on_error", "", "Set to \"continue\" to ignore an error and continue when a container terminates with a non-zero exit code."+
 		" Set to \"stopAndFail\" to declare a failure with a step error and stop executing the rest of the steps.")
 	stepMetadataDir        = flag.String("step_metadata_dir", "", "If specified, create directory to store the step metadata e.g. /tekton/steps/<step-name>/")
@@ -66,25 +66,8 @@ var (
 
 const (
 	defaultWaitPollingInterval = time.Second
-	breakpointExitSuffix       = ".breakpointexit"
 )
 
-func checkForBreakpointOnFailure(e entrypoint.Entrypointer, breakpointExitPostFile string) {
-	if e.BreakpointOnFailure {
-		if waitErr := e.Waiter.Wait(context.Background(), breakpointExitPostFile, false, false); waitErr != nil {
-			log.Println("error occurred while waiting for " + breakpointExitPostFile + " : " + waitErr.Error())
-		}
-		// get exitcode from .breakpointexit
-		exitCode, readErr := e.BreakpointExitCode(breakpointExitPostFile)
-		// if readErr exists, the exitcode with default to 0 as we would like
-		// to encourage to continue running the next steps in the taskRun
-		if readErr != nil {
-			log.Println("error occurred while reading breakpoint exit code : " + readErr.Error())
-		}
-		os.Exit(exitCode)
-	}
-}
-
 func main() {
 	// Add credential flags originally introduced with our legacy credentials helper
 	// image (creds-init).
@@ -172,6 +155,7 @@ func main() {
 		Timeout:                timeout,
 		StepWhenExpressions:    when,
 		BreakpointOnFailure:    *breakpointOnFailure,
+		DebugBeforeStep:        *debugBeforeStep,
 		OnError:                *onError,
 		StepMetadataDir:        *stepMetadataDir,
 		SpireWorkloadAPI:       spireWorkloadAPI,
@@ -185,8 +169,10 @@ func main() {
 	}
 
 	if err := e.Go(); err != nil {
-		breakpointExitPostFile := e.PostFile + breakpointExitSuffix
 		switch t := err.(type) { //nolint:errorlint // checking for multiple types with errors.As is ugly.
+		case entrypoint.DebugBeforeStepError:
+			log.Println("Skipping execute step script because before step breakpoint fail-continue")
+			os.Exit(1)
 		case entrypoint.SkipError:
 			log.Print("Skipping step because a previous step failed")
 			os.Exit(1)
@@ -210,7 +196,7 @@ func main() {
 			// in both cases has an ExitStatus() method with the
 			// same signature.
 			if status, ok := t.Sys().(syscall.WaitStatus); ok {
-				checkForBreakpointOnFailure(e, breakpointExitPostFile)
+				e.CheckForBreakpointOnFailure()
 				// ignore a step error i.e. do not exit if a container terminates with a non-zero exit code when onError is set to "continue"
 				if e.OnError != entrypoint.ContinueOnError {
 					os.Exit(status.ExitStatus())
@@ -221,7 +207,7 @@ func main() {
 				log.Fatalf("Error executing command (ExitError): %v", err)
 			}
 		default:
-			checkForBreakpointOnFailure(e, breakpointExitPostFile)
+			e.CheckForBreakpointOnFailure()
 			log.Fatalf("Error executing command: %v", err)
 		}
 	}

diff --git a/docs/debug.md b/docs/debug.md
@@ -13,7 +13,8 @@ weight: 108
     - [Breakpoint on Failure](#breakpoint-on-failure)
       - [Failure of a Step](#failure-of-a-step)
       - [Halting a Step on failure](#halting-a-step-on-failure)
-      - [Exiting breakpoint](#exiting-breakpoint)
+      - [Exiting onfailure breakpoint](#exiting-onfailure-breakpoint)
+    - [Breakpoint before step](#breakpoint-before-step)
 - [Debug Environment](#debug-environment)
   - [Mounts](#mounts)
   - [Debug Scripts](#debug-scripts)
@@ -59,12 +60,26 @@ stopping write of the `<step-no>.err` file and waiting on a signal by the user t
 In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step 
 environment using a CLI or an IDE. 
 
-#### Exiting breakpoint
+#### Exiting onfailure breakpoint
 
 To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which 
 would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
 would unpause and exit the step container.
 
+### Breakpoint before step
+
+
+The TaskRun pauses before the step is executed and waits for the user to debug it.
+When the before-step breakpoint takes effect, the user can see the following message
+in the log of the corresponding step container:
+```
+debug before step breakpoint has taken effect, waiting for user's decision:
+1) continue, use cmd: /tekton/debug/scripts/debug-beforestep-continue
+2) fail-continue, use cmd: /tekton/debug/scripts/debug-beforestep-fail-continue
+```
+1. Executing `/tekton/debug/scripts/debug-beforestep-continue` resumes execution of the step program.
+2. Executing `/tekton/debug/scripts/debug-beforestep-fail-continue` skips execution of the step and marks the step as failed.
+
 ## Debug Environment 
 
 Additional environment augmentations made available to the TaskRun Pod to aid in troubleshooting and managing step lifecycle.
@@ -80,7 +95,13 @@ to reflect step number. eg: Step 0 will have `/tekton/debug/info/0`, Step 1 will
 ### Debug Scripts
 
 `/tekton/debug/scripts/debug-continue` : Mark the step as completed with success by writing to `/tekton/run`. eg: User wants to exit
-breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0.breakpointexit`.
+onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit`.
 
 `/tekton/debug/scripts/debug-fail-continue` : Mark the step as completed with failure by writing to `/tekton/run`. eg: User wants to exit
-breakpoint for failed step 0. Running this script would create `/tekton/run/0.err` and `/tekton/run/0.breakpointexit`.
+onfailure breakpoint for failed step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.breakpointexit.err`.
+
+`/tekton/debug/scripts/debug-beforestep-continue` : Let the step continue to execute by writing to `/tekton/run`. eg: User wants to exit
+the before-step breakpoint for step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit`.
+
+`/tekton/debug/scripts/debug-beforestep-fail-continue` : Prevent the step from executing by writing to `/tekton/run`. eg: User wants to exit
+the before-step breakpoint for step 0. Running this script would create `/tekton/run/0` and `/tekton/run/0/out.beforestepexit.err`.
diff --git a/docs/developers/taskruns.md b/docs/developers/taskruns.md
@@ -284,4 +284,54 @@ There are known issues with the existing implementation of sidecars:
   but an Error when the sidecar exits with an error. This is only apparent when
   using `kubectl` to get the pods of a TaskRun, not when describing the Pod
   using `kubectl describe pod ...` nor when looking at the TaskRun, but can be
-  quite confusing.
+  quite confusing.
+
+## Breakpoint on Failure
+
+Halting a TaskRun execution on Failure of a step.
+
+### Failure of a Step
+
+The entrypoint binary is used to manage the lifecycle of a step. Steps are aligned beforehand by the TaskRun controller
+allowing each step to run in a particular order. This is done using the `-wait_file` and `-post_file` flags. The former
+lets the entrypoint binary know that it has to wait on creation of a particular file before starting execution of the step.
+The latter provides information on the step number and signals the next step on completion of the step.
+
+On success of a step, the `-post_file` is written as is, signalling the next step which would have the same argument given
+for `-wait_file` to resume the entrypoint process and move ahead with the step.
+
+On failure of a step, the `-post_file` is written with `.err` appended to it, denoting that the previous step has failed with
+an error. The subsequent steps are skipped in this case as well, marking the TaskRun as a failure.
+
+### Halting a Step on failure
+
+The failed step writes `<step-no>.err` to `/tekton/run` and stops running completely. To be able to debug a step we would
+need it to continue running (not exit), not skip the next steps and signal health of the step. By disabling step skipping,
+stopping write of the `<step-no>.err` file and waiting on a signal by the user to disable the halt, we would be simulating a
+"breakpoint".
+
+In this breakpoint, which is essentially a limbo state the TaskRun finds itself in, the user can interact with the step
+environment using a CLI or an IDE.
+
+### Exiting onfailure breakpoint
+
+To exit a step which has been paused upon failure, the step would wait on a file similar to `<step-no>.breakpointexit` which
+would unpause and exit the step container. eg: Step 0 fails and is paused. Writing `0.breakpointexit` in `/tekton/run`
+would unpause and exit the step container.
+
+## Breakpoint before step
+
+The TaskRun pauses before the step is executed and waits for the user to debug it.
+
+### Halting a Step before execution
+
+The step program is executed after all `-wait_file` monitoring ends. To let the user start debugging before the step is executed,
+pass the `debug_before_step` flag to `entrypoint`.
+Once the monitoring of `waitFiles` ends, `entrypoint` then pauses
+and waits for a file such as `/tekton/run/0/out.beforestepexit` (for step 0) to be written.
+
+### Exiting before step breakpoint
+
+`entrypoint` waits for `/tekton/run/{{ stepID }}/out.beforestepexit` or `/tekton/run/{{ stepID }}/out.beforestepexit.err` to
+decide whether to proceed with this step: `out.beforestepexit` means continue with the step, and
+`out.beforestepexit.err` means do not continue with the step.
diff --git a/docs/pipeline-api.md b/docs/pipeline-api.md
@@ -5142,6 +5142,17 @@ string
 failed step will not exit</p>
 </td>
 </tr>
+<tr>
+<td>
+<code>beforeSteps</code><br/>
+<em>
+[]string
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+</td>
+</tr>
 </tbody>
 </table>
 <h3 id="tekton.dev/v1.TaskKind">TaskKind
@@ -14926,6 +14937,17 @@ string
 failed step will not exit</p>
 </td>
 </tr>
+<tr>
+<td>
+<code>beforeSteps</code><br/>
+<em>
+[]string
+</em>
+</td>
+<td>
+<em>(Optional)</em>
+</td>
+</tr>
 </tbody>
 </table>
 <h3 id="tekton.dev/v1beta1.TaskKind">TaskKind

diff --git a/docs/taskruns.md b/docs/taskruns.md
@@ -909,6 +909,18 @@ spec:
       onFailure: "enabled"
 ```
 
+### Breakpoint before step
+
+If you want to set a breakpoint before the step is executed, you can add the step name to the `beforeSteps` field in the following way:
+
+```yaml
+spec:
+  debug:
+    breakpoints:
+      beforeSteps: 
+        - {{ stepName }}
+```
+
 Upon failure of a step, the TaskRun Pod execution is halted. If this TaskRun Pod continues to run without any lifecycle
 change done by the user (running the debug-continue or debug-fail-continue script) the TaskRun would be subject to
 [TaskRunTimeout](#configuring-the-failure-timeout).
@@ -931,6 +943,10 @@ perform :-
 
 `debug-fail-continue`: Mark the step as a failure and exit the breakpoint.
 
+`debug-beforestep-continue`: Exit the before-step breakpoint and let the step execute.
+
+`debug-beforestep-fail-continue`: Exit the before-step breakpoint without executing the step, marking it as failed.
+
 *More information on the inner workings of debug can be found in the [Debug documentation](debug.md)*
 
 ## Code examples

diff --git a/pkg/apis/pipeline/v1/openapi_generated.go b/pkg/apis/pipeline/v1/openapi_generated.go
diff --git a/pkg/apis/pipeline/v1/swagger.json b/pkg/apis/pipeline/v1/swagger.json
@@ -1821,6 +1821,14 @@
       "description": "TaskBreakpoints defines the breakpoint config for a particular Task",
       "type": "object",
       "properties": {
+        "beforeSteps": {
+          "type": "array",
+          "items": {
+            "type": "string",
+            "default": ""
+          },
+          "x-kubernetes-list-type": "atomic"
+        },
         "onFailure": {
           "description": "if enabled, pause TaskRun on failure of a step failed step will not exit",
           "type": "string"

diff --git a/pkg/apis/pipeline/v1/taskrun_types.go b/pkg/apis/pipeline/v1/taskrun_types.go
@@ -26,6 +26,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/utils/clock"
 	"knative.dev/pkg/apis"
 	duckv1 "knative.dev/pkg/apis/duck/v1"
@@ -121,6 +122,9 @@ type TaskBreakpoints struct {
 	// failed step will not exit
 	// +optional
 	OnFailure string `json:"onFailure,omitempty"`
+	// +optional
+	// +listType=atomic
+	BeforeSteps []string `json:"beforeSteps,omitempty"`
 }
 
 // NeedsDebugOnFailure return true if the TaskRun is configured to debug on failure
@@ -131,14 +135,28 @@ func (trd *TaskRunDebug) NeedsDebugOnFailure() bool {
 	return trd.Breakpoints.OnFailure == EnabledOnFailureBreakpoint
 }
 
+// NeedsDebugBeforeStep returns true if the given step is listed in the breakpoints' BeforeSteps
+func (trd *TaskRunDebug) NeedsDebugBeforeStep(stepName string) bool {
+	if trd.Breakpoints == nil {
+		return false
+	}
+	beforeStepSets := sets.NewString(trd.Breakpoints.BeforeSteps...)
+	return beforeStepSets.Has(stepName)
+}
+
 // StepNeedsDebug return true if the step is configured to debug
 func (trd *TaskRunDebug) StepNeedsDebug(stepName string) bool {
-	return trd.NeedsDebugOnFailure()
+	return trd.NeedsDebugOnFailure() || trd.NeedsDebugBeforeStep(stepName)
 }
 
 // NeedsDebug return true if defined onfailure or have any before, after steps
 func (trd *TaskRunDebug) NeedsDebug() bool {
-	return trd.NeedsDebugOnFailure()
+	return trd.NeedsDebugOnFailure() || trd.HaveBeforeSteps()
+}
+
+// HaveBeforeSteps returns true if at least one before-step breakpoint is configured
+func (trd *TaskRunDebug) HaveBeforeSteps() bool {
+	return trd.Breakpoints != nil && len(trd.Breakpoints.BeforeSteps) > 0
 }
 
 // TaskRunInputs holds the input values that this task was invoked with.