Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support ArrayNode subNode timeouts #6054

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions flytepropeller/pkg/apis/flyteworkflow/v1alpha1/iface.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ type ExecutableArrayNodeStatus interface {
GetSubNodeTaskPhases() bitarray.CompactArray
GetSubNodeRetryAttempts() bitarray.CompactArray
GetSubNodeSystemFailures() bitarray.CompactArray
GetSubNodeDeltaTimestamps() bitarray.CompactArray
GetTaskPhaseVersion() uint32
}

Expand All @@ -302,6 +303,7 @@ type MutableArrayNodeStatus interface {
SetSubNodeTaskPhases(subNodeTaskPhases bitarray.CompactArray)
SetSubNodeRetryAttempts(subNodeRetryAttempts bitarray.CompactArray)
SetSubNodeSystemFailures(subNodeSystemFailures bitarray.CompactArray)
SetSubNodeDeltaTimestamps(subNodeDeltaTimestamps bitarray.CompactArray)
SetTaskPhaseVersion(taskPhaseVersion uint32)
}

Expand Down
26 changes: 19 additions & 7 deletions flytepropeller/pkg/apis/flyteworkflow/v1alpha1/node_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,13 +230,14 @@

type ArrayNodeStatus struct {
MutableStruct
Phase ArrayNodePhase `json:"phase,omitempty"`
ExecutionError *core.ExecutionError `json:"executionError,omitempty"`
SubNodePhases bitarray.CompactArray `json:"subphase,omitempty"`
SubNodeTaskPhases bitarray.CompactArray `json:"subtphase,omitempty"`
SubNodeRetryAttempts bitarray.CompactArray `json:"subattempts,omitempty"`
SubNodeSystemFailures bitarray.CompactArray `json:"subsysfailures,omitempty"`
TaskPhaseVersion uint32 `json:"taskPhaseVersion,omitempty"`
Phase ArrayNodePhase `json:"phase,omitempty"`
ExecutionError *core.ExecutionError `json:"executionError,omitempty"`
SubNodePhases bitarray.CompactArray `json:"subphase,omitempty"`
SubNodeTaskPhases bitarray.CompactArray `json:"subtphase,omitempty"`
SubNodeRetryAttempts bitarray.CompactArray `json:"subattempts,omitempty"`
SubNodeSystemFailures bitarray.CompactArray `json:"subsysfailures,omitempty"`
SubNodeDeltaTimestamps bitarray.CompactArray `json: "subtimestamps",omitempty"`
TaskPhaseVersion uint32 `json:"taskPhaseVersion,omitempty"`
}

func (in *ArrayNodeStatus) GetArrayNodePhase() ArrayNodePhase {
Expand Down Expand Up @@ -305,6 +306,17 @@
}
}

func (in *ArrayNodeStatus) GetSubNodeDeltaTimestamps() bitarray.CompactArray {
return in.SubNodeDeltaTimestamps

Check warning on line 310 in flytepropeller/pkg/apis/flyteworkflow/v1alpha1/node_status.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/apis/flyteworkflow/v1alpha1/node_status.go#L309-L310

Added lines #L309 - L310 were not covered by tests
}

func (in *ArrayNodeStatus) SetSubNodeDeltaTimestamps(subNodeDeltaTimestamps bitarray.CompactArray) {
if in.SubNodeDeltaTimestamps != subNodeDeltaTimestamps {
in.SetDirty()
in.SubNodeDeltaTimestamps = subNodeDeltaTimestamps
}

Check warning on line 317 in flytepropeller/pkg/apis/flyteworkflow/v1alpha1/node_status.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/apis/flyteworkflow/v1alpha1/node_status.go#L313-L317

Added lines #L313 - L317 were not covered by tests
}

func (in *ArrayNodeStatus) GetTaskPhaseVersion() uint32 {
return in.TaskPhaseVersion
}
Expand Down
28 changes: 28 additions & 0 deletions flytepropeller/pkg/controller/nodes/array/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
"fmt"
"math"
"strconv"
"time"

"k8s.io/apimachinery/pkg/apis/meta/v1"

idlcore "github.com/flyteorg/flyte/flyteidl/gen/pb-go/flyteidl/core"
"github.com/flyteorg/flyte/flyteplugins/go/tasks/pluginmachinery/core"
Expand All @@ -28,6 +31,11 @@
"github.com/flyteorg/flyte/flytestdlib/storage"
)

const (
// value is 3 days of seconds which is covered by 18 bits (262144)
MAX_DELTA_TIMESTAMP = 259200
)

var (
nilLiteral = &idlcore.Literal{
Value: &idlcore.Literal_Scalar{
Expand Down Expand Up @@ -254,6 +262,7 @@
{arrayReference: &arrayNodeState.SubNodeTaskPhases, maxValue: len(core.Phases) - 1},
{arrayReference: &arrayNodeState.SubNodeRetryAttempts, maxValue: maxAttemptsValue},
{arrayReference: &arrayNodeState.SubNodeSystemFailures, maxValue: maxSystemFailuresValue},
{arrayReference: &arrayNodeState.SubNodeDeltaTimestamps, maxValue: MAX_DELTA_TIMESTAMP},
} {

*item.arrayReference, err = bitarray.NewCompactArray(uint(size), bitarray.Item(item.maxValue)) // #nosec G115
Expand Down Expand Up @@ -380,6 +389,18 @@
arrayNodeState.SubNodeRetryAttempts.SetItem(index, uint64(subNodeStatus.GetAttempts()))
arrayNodeState.SubNodeSystemFailures.SetItem(index, uint64(subNodeStatus.GetSystemFailures()))

startedAt := nCtx.NodeStatus().GetLastAttemptStartedAt()
subNodeStartedAt := subNodeStatus.GetLastAttemptStartedAt()
if subNodeStartedAt == nil {
// subNodeStartedAt == nil indicates either (1) node has not started or (2) node status has
// been reset (ex. retryable failure). in both cases we set the delta timestamp to 0
arrayNodeState.SubNodeDeltaTimestamps.SetItem(index, 0)
} else if startedAt != nil && arrayNodeState.SubNodeDeltaTimestamps.GetItem(index) == 0 {
// otherwise if `SubNodeDeltaTimestamps` is unset, we compute the delta and set it
deltaDuration := uint64(subNodeStartedAt.Time.Sub(startedAt.Time).Seconds())
arrayNodeState.SubNodeDeltaTimestamps.SetItem(index, deltaDuration)
}

Check warning on line 402 in flytepropeller/pkg/controller/nodes/array/handler.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/controller/nodes/array/handler.go#L399-L402

Added lines #L399 - L402 were not covered by tests

// increment task phase version if subNode phase or task phase changed
if subNodeStatus.GetPhase() != nodeExecutionRequest.nodePhase || subNodeStatus.GetTaskNodeStatus().GetPhase() != nodeExecutionRequest.taskPhase {
incrementTaskPhaseVersion = true
Expand Down Expand Up @@ -767,6 +788,12 @@
return nil, nil, nil, nil, nil, nil, err
}

// compute start time for subNode using delta timestamp from ArrayNode NodeStatus
var startedAt *v1.Time
if deltaSeconds := arrayNodeState.SubNodeDeltaTimestamps.GetItem(subNodeIndex); deltaSeconds != 0 {
startedAt = &v1.Time{Time: nCtx.NodeStatus().GetLastAttemptStartedAt().Add(time.Duration(deltaSeconds) * time.Second)}
}

Check warning on line 795 in flytepropeller/pkg/controller/nodes/array/handler.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/controller/nodes/array/handler.go#L794-L795

Added lines #L794 - L795 were not covered by tests

subNodeStatus := &v1alpha1.NodeStatus{
Phase: nodePhase,
DataDir: subDataDir,
Expand All @@ -777,6 +804,7 @@
Phase: taskPhase,
PluginState: pluginStateBytes,
},
LastAttemptStartedAt: startedAt,
}

// initialize mocks
Expand Down
4 changes: 4 additions & 0 deletions flytepropeller/pkg/controller/nodes/array/handler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ func TestAbort(t *testing.T) {
{arrayReference: &arrayNodeState.SubNodeTaskPhases, maxValue: len(core.Phases) - 1},
{arrayReference: &arrayNodeState.SubNodeRetryAttempts, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeSystemFailures, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeDeltaTimestamps, maxValue: 1024},
} {

*item.arrayReference, err = bitarray.NewCompactArray(uint(size), bitarray.Item(item.maxValue)) // #nosec G115
Expand Down Expand Up @@ -348,6 +349,7 @@ func TestFinalize(t *testing.T) {
{arrayReference: &arrayNodeState.SubNodeTaskPhases, maxValue: len(core.Phases) - 1},
{arrayReference: &arrayNodeState.SubNodeRetryAttempts, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeSystemFailures, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeDeltaTimestamps, maxValue: 1024},
} {
*item.arrayReference, err = bitarray.NewCompactArray(uint(size), bitarray.Item(item.maxValue)) // #nosec G115
assert.NoError(t, err)
Expand Down Expand Up @@ -858,6 +860,7 @@ func TestHandleArrayNodePhaseExecuting(t *testing.T) {
{arrayReference: &arrayNodeState.SubNodeTaskPhases, maxValue: len(core.Phases) - 1},
{arrayReference: &arrayNodeState.SubNodeRetryAttempts, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeSystemFailures, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeDeltaTimestamps, maxValue: 1024},
} {
*item.arrayReference, err = bitarray.NewCompactArray(uint(size), bitarray.Item(item.maxValue)) // #nosec G115
assert.NoError(t, err)
Expand Down Expand Up @@ -1301,6 +1304,7 @@ func TestHandleArrayNodePhaseFailing(t *testing.T) {
{arrayReference: &arrayNodeState.SubNodeTaskPhases, maxValue: len(core.Phases) - 1},
{arrayReference: &arrayNodeState.SubNodeRetryAttempts, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeSystemFailures, maxValue: 1},
{arrayReference: &arrayNodeState.SubNodeDeltaTimestamps, maxValue: 1024},
} {
*item.arrayReference, err = bitarray.NewCompactArray(uint(len(test.subNodePhases)), bitarray.Item(item.maxValue)) // #nosec G115
assert.NoError(t, err)
Expand Down
15 changes: 8 additions & 7 deletions flytepropeller/pkg/controller/nodes/handler/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,12 @@ type GateNodeState struct {
}

type ArrayNodeState struct {
Phase v1alpha1.ArrayNodePhase
TaskPhaseVersion uint32
Error *core.ExecutionError
SubNodePhases bitarray.CompactArray
SubNodeTaskPhases bitarray.CompactArray
SubNodeRetryAttempts bitarray.CompactArray
SubNodeSystemFailures bitarray.CompactArray
Phase v1alpha1.ArrayNodePhase
TaskPhaseVersion uint32
Error *core.ExecutionError
SubNodePhases bitarray.CompactArray
SubNodeTaskPhases bitarray.CompactArray
SubNodeRetryAttempts bitarray.CompactArray
SubNodeSystemFailures bitarray.CompactArray
SubNodeDeltaTimestamps bitarray.CompactArray
}
7 changes: 6 additions & 1 deletion flytepropeller/pkg/controller/nodes/node_state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
}

func (n *nodeStateManager) HasGateNodeState() bool {
return n.g != nil
return n.g != nil
}

func (n *nodeStateManager) HasArrayNodeState() bool {
Expand Down Expand Up @@ -181,6 +181,11 @@
if subNodeSystemFailuresCopy := subNodeSystemFailures.DeepCopy(); subNodeSystemFailuresCopy != nil {
as.SubNodeSystemFailures = *subNodeSystemFailuresCopy
}

subNodeDeltaTimestamps := an.GetSubNodeDeltaTimestamps()
if subNodeDeltaTimestampsCopy := subNodeDeltaTimestamps.DeepCopy(); subNodeDeltaTimestampsCopy != nil {
as.SubNodeDeltaTimestamps = *subNodeDeltaTimestampsCopy
}

Check warning on line 188 in flytepropeller/pkg/controller/nodes/node_state_manager.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/controller/nodes/node_state_manager.go#L185-L188

Added lines #L185 - L188 were not covered by tests
}
return as
}
Expand Down
1 change: 1 addition & 0 deletions flytepropeller/pkg/controller/nodes/transformers.go
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@
t.SetSubNodeTaskPhases(na.SubNodeTaskPhases)
t.SetSubNodeRetryAttempts(na.SubNodeRetryAttempts)
t.SetSubNodeSystemFailures(na.SubNodeSystemFailures)
t.SetSubNodeDeltaTimestamps(na.SubNodeDeltaTimestamps)

Check warning on line 317 in flytepropeller/pkg/controller/nodes/transformers.go

View check run for this annotation

Codecov / codecov/patch

flytepropeller/pkg/controller/nodes/transformers.go#L317

Added line #L317 was not covered by tests
t.SetTaskPhaseVersion(na.TaskPhaseVersion)
}
}
Loading