Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Document ReconcileResult type (fixes #1419) #1463

Merged
merged 6 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 3 additions & 16 deletions controllers/k8ssandra/medusa_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,7 @@ func (r *K8ssandraClusterReconciler) reconcileMedusa(
return result.Error(err)
}
purgeCronJob.SetLabels(labels.CleanedUpByLabels(kcKey))
recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *purgeCronJob)
switch {
case recRes.IsError():
return recRes
case recRes.IsRequeue():
if recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *purgeCronJob); recRes.Completed() {
return recRes
}
} else {
Expand Down Expand Up @@ -173,12 +169,7 @@ func (r *K8ssandraClusterReconciler) reconcileMedusaSecrets(
return result.Error(err)
}

res := r.reconcileRemoteBucketSecretsDeprecated(ctx, r.ClientCache.GetLocalClient(), kc, logger)
switch {
case res.IsError():
logger.Error(res.GetError(), "Failed to reconcile Medusa bucket secrets")
return res
case res.IsRequeue():
if res := r.reconcileRemoteBucketSecretsDeprecated(ctx, r.ClientCache.GetLocalClient(), kc, logger); res.Completed() {
return res
}
}
Expand All @@ -202,11 +193,7 @@ func (r *K8ssandraClusterReconciler) reconcileMedusaConfigMap(
desiredConfigMap := medusa.CreateMedusaConfigMap(namespace, kc.SanitizedName(), medusaIni)
kcKey := utils.GetKey(kc)
desiredConfigMap.SetLabels(labels.CleanedUpByLabels(kcKey))
recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredConfigMap)
switch {
case recRes.IsError():
return recRes
case recRes.IsRequeue():
if recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredConfigMap); recRes.Completed() {
return recRes
}
}
Expand Down
6 changes: 1 addition & 5 deletions controllers/k8ssandra/vector.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,7 @@ func (r *K8ssandraClusterReconciler) reconcileVector(

// Check if the vector config map already exists
desiredVectorConfigMap.SetLabels(labels.CleanedUpByLabels(kcKey))
recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap)
switch {
case recRes.IsError():
return recRes
case recRes.IsRequeue():
if recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap); recRes.Completed() {
return recRes
}
} else {
Expand Down
9 changes: 2 additions & 7 deletions controllers/reaper/vector.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,14 +42,9 @@ func (r *ReaperReconciler) reconcileVectorConfigMap(
dcLogger.Error(err, "Failed to set controller reference on new Reaper Vector ConfigMap", "ConfigMap", configMapKey)
return ctrl.Result{}, err
}
recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap)
switch {
case recRes.IsError():
return ctrl.Result{}, recRes.GetError()
case recRes.IsRequeue():
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
if recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap); recRes.Completed() {
return recRes.Output()
}

} else {
if err := shared.DeleteConfigMapIfExists(ctx, remoteClient, configMapKey, dcLogger); err != nil {
return ctrl.Result{}, err
Expand Down
12 changes: 4 additions & 8 deletions controllers/stargate/stargate_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ func (r *StargateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
if stargateConfigResult, err := r.reconcileStargateConfigMap(ctx, stargate, dcConfig, userStargateCassandraYaml, userStargateCqlYaml, req.Namespace, *actualDc, logger); err != nil {
return ctrl.Result{}, err
} else {
if stargateConfigResult.Requeue {
if stargateConfigResult.Requeue || stargateConfigResult.RequeueAfter > 0 {
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
}
}
Expand Down Expand Up @@ -198,7 +198,7 @@ func (r *StargateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
// reconcile Vector configmap
if vectorReconcileResult, err := r.reconcileVectorConfigMap(ctx, *stargate, actualDc, r.Client, logger); err != nil {
return vectorReconcileResult, err
} else if vectorReconcileResult.Requeue {
} else if vectorReconcileResult.Requeue || vectorReconcileResult.RequeueAfter > 0 {
return vectorReconcileResult, nil
}

Expand Down Expand Up @@ -446,12 +446,8 @@ func (r *StargateReconciler) reconcileStargateConfigMap(
return ctrl.Result{}, err
}

recRes := reconciliation.ReconcileObject(ctx, r.Client, r.DefaultDelay, *desiredConfigMap)
switch {
case recRes.IsError():
return ctrl.Result{}, recRes.GetError()
case recRes.IsRequeue():
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
if recRes := reconciliation.ReconcileObject(ctx, r.Client, r.DefaultDelay, *desiredConfigMap); recRes.Completed() {
return recRes.Output()
}
logger.Info("Stargate ConfigMap successfully reconciled")
return ctrl.Result{}, nil
Expand Down
5 changes: 5 additions & 0 deletions controllers/stargate/stargate_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package stargate
import (
"context"
"encoding/json"
"os"
"testing"
"time"

Expand Down Expand Up @@ -39,6 +40,10 @@ const (
var managementApiFactory = &testutils.FakeManagementApiFactory{}

func TestStargate(t *testing.T) {

os.Setenv("REQUEUE_DEFAULT_DELAY", "10ms")
os.Setenv("REQUEUE_LONG_DELAY", "10ms")

ctx := testutils.TestSetup(t)
ctx, cancel := context.WithCancel(ctx)
testEnv := &testutils.TestEnv{}
Expand Down
8 changes: 2 additions & 6 deletions controllers/stargate/vector.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,8 @@ func (r *StargateReconciler) reconcileVectorConfigMap(
dcLogger.Error(err, "Failed to set controller reference on new Stargate Vector ConfigMap", "ConfigMap", configMapKey)
return ctrl.Result{}, err
}
recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap)
switch {
case recRes.IsError():
return ctrl.Result{}, recRes.GetError()
case recRes.IsRequeue():
return ctrl.Result{RequeueAfter: r.DefaultDelay}, nil
if recRes := reconciliation.ReconcileObject(ctx, remoteClient, r.DefaultDelay, *desiredVectorConfigMap); recRes.Completed() {
return recRes.Output()
}
} else {
if err := deleteConfigMapIfExists(ctx, remoteClient, configMapKey, dcLogger); err != nil {
Expand Down
2 changes: 1 addition & 1 deletion pkg/medusa/refresh_secrets.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ func RefreshSecrets(dc *cassdcapi.CassandraDatacenter, ctx context.Context, clie
case recRes.IsRequeue():
requeues++
continue
case recRes.IsDone():
case !recRes.Completed():
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changed this to keep the exact same behavior, but the overall function has issues: the block if requeues > 0 { ... } at line 58 is never reached (either with this version or the original ones), so retries are not behaving as expected.

Copy link
Member

@Miles-Garnsey Miles-Garnsey Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right, we should drop the continue statements and move the check of requeues outside the loop for this to make sense.

continue
}
if requeues > 0 {
Expand Down
27 changes: 18 additions & 9 deletions pkg/reconciliation/generic.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,18 @@ type Reconcileable[T any] interface {
*T
}

// Try with U, a type of any whose POINTER still fulfils Reoncilable...
// ReconcileObject ensures that desiredObject exists in the given state, either by creating it, or updating it if it
// already exists.
func ReconcileObject[U any, T Reconcileable[U]](ctx context.Context, kClient client.Client, requeueDelay time.Duration, desiredObject U) result.ReconcileResult {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[thought] This is off-topic, but I don't understand why this function schedules a requeue after a successful creation.

Copy link
Member

@Miles-Garnsey Miles-Garnsey Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because that's the pattern followed throughout the codebase. Object creation is asynchronous, acceptance by the API server does not necessarily mean that something won't go wrong downstream of that. As a result, we check again for the success of the creation operation before continuing, which indeed does mean a requeue.

There is also stuff to do with optimistic locking and the way the caches work on the controller which make this necessary - try removing it if you're curious as to what breaks.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Object creation is asynchronous, acceptance by the API server does not necessarily mean that something won't go wrong downstream of that.

Is that documented anywhere? The client package docs do not go into many details.

Copy link
Member

@Miles-Garnsey Miles-Garnsey Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's just how asynchronous HTTP APIs work, they return a status code but sometimes object creation might fail out elsewhere.

But I think the more critical thing here is that (from memory) due to some of the caching behaviour of these clients, the object will not appear if you try to go do a Get in some circumstances. I encourage you to try to remove the requeues and see if things break - I believe they do.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, this example in the controller-runtime codebase doesn't requeue. They reference pod.CreationTimestamp, a field that is set by the server during the creation operation, immediately after a successful Create() call.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By "extra" requeues I meant the one after a successful Create, and the one after a successful Update. I pushed the change so you can check the diff.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That means that (while I'm wrong on the consistency level of writes), I'm right on reads, and some of our logic will probably fail due to failure to refresh the cache.

This depends on what is read after the object was written. If it's something that the Kubernetes API returns, then it's consistent right away. For example, Olivier's example of using creationTimestamp, probably resourceVersion and uid etc are directly accessible in the object you just posted.

However, for many use cases, what happens is that there's other controllers doing some extra processing and then modifying the object on the Kubernetes side (as a lot of Kubernetes processing itself is async with controllers). Can't access or Update .Status for example for some objects, like Pods. Or in our runtimes, if k8ssandra-operator creates CassandraDatacenter, it probably can't Update the Status field of CassandraDatacenter even instantly after, because cass-operator has modified it and the resourceVersions no longer match.

The other thing that should be understood is that Read after Write is cached, but of course Write itself updates the object that was sent. So doing:

obj = Create(obj)
a := obj.ObjectMeta.uid
// and if nothing else is touching our object, we can patch it afterwards also.

Is valid. However, doing:

obj = Create(obj)
obj = Get(obj)
a := obj.ObjectMeta.uid

Is not valid. The Get() call is coming from cache and would probably not return an object at all. Using Reader interface from manager will allow doing this as it isn't cached, but we don't really use that interface a lot (I think only ClientConfigs does that).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What we're saying here is that a requeue may or may not be needed, depending on the case.
If the requeue itself is harmless, just adding a little delay, but is protecting the code from potential bugs we might as well keep it, don't we?
@olim7t, for the sake of moving this PR forward and keeping its scope to (mostly) the original one, I'd suggest to rollback this change and have a separate discussion for it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What we're saying here is that a requeue may or may not be needed, depending on the case.

This thread has turned into a generic conversation about requeues, but we are in fact looking at a very specific case. I've only made two changes related to requeues in this PR:

  • ReconcileObject tries to create the object and succeeds
  • ReconcileObject tries to update the object and succeeds

(it's all in Don't requeue successful writes in ReconcileObject if anyone wants to look at the diff)

If we accept the assertion that writes are synchronous then there's absolutely no need to requeue. ReconcileObject wanted to apply changes, those changes are applied, we're done.

Other controllers and cached reads are valid concerns that we encounter in other places, but not here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Writes being synchronous, there doesn't seem to be a valid reason to force a requeue for each object creation.
That logic can be offloaded to the specific code (as opposed to this generic reconciler) that will deal with what happens after the creation/update of the object.
Let's move forward and not requeue as part of the generic reconciler. @Miles-Garnsey , sounds good?

recResult, _ := ReconcileAndGetObject[U, T](ctx, kClient, requeueDelay, desiredObject)
return recResult
}

// ReconcileAndGetObject ensures that desiredObject exists in the given state, either by creating it, or updating it if
// it already exists. It returns the current state of the object on the server after the reconciliation.
func ReconcileAndGetObject[U any, T Reconcileable[U]](
ctx context.Context, kClient client.Client, requeueDelay time.Duration, desiredObject U,
) (result.ReconcileResult, *U) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I propose we add this variant, it's almost the same but returns the state of the object on the server after the reconciliation.
This is useful in two cases:

  • the hash annotation matched so the object was not updated, but we're interested in a field that was updated independently by another controller. For example the IP of a LoadBalancer service in its status.
  • we're interested in a field that is set by the write operation itself, for example CreationTimestamp.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this solves some problems but might be overoptimising - it introduces more complexity than I'd like and relies on folks understanding the caching behaviour. Based on the conversation here I think that is tricky for even experienced users and I'd rather avoid callers having to make more decisions about those subtleties.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

relies on folks understanding the caching behaviour

How so? We're either returning the result of a read that observed matching hash annotations, or the result of a create or update (not subject to caching).

If we don't have this variant, we'll need to resort to ReconcileObject followed by a separate Get. If anything, it sounds like it could produce more caching issues (Get not seeing the result of a write in ReconcileObject).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't have this variant, we'll need to resort to ReconcileObject followed by a separate Get. If anything, it sounds like it could produce more caching issues (Get not seeing the result of a write in ReconcileObject).

In that behavior, we would need to reconcile and wait for the cache to have that object (and potentially reconcile again, fail etc). Otherwise, we need to use the Create/Update one, but never Create+Get or Update+Get combo.

So it makes sense to get the object back if user knows what to do with it. Actually, why would we even have any non-returning version? Create/Update always return the object also.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm with @burmanm on this. The Get call you're making here won't actually result in a cache update so this variant isn't going to solve the problem.

When I say that folks would need to understand the caching behaviour, what I mean is that they need to know that the object returned by this new variant is not necessarily up to date. They also need to reason about what fields are up to date, which is non-trivial in my view.

I am in favour of just always re-queuing so that we elide these complexities.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After chatting with @burmanm, it looks like all 3 of us (Olivier, Micke and myself) agree that a requeue shouldn't be done systematically. Let's then move forward with what @olim7t did here, and follow up with a refactoring that merges this new ReconcileAndGetObject() function into just ReconcileObject(), changing the signature to also return the object. That'll require a few changes in the code that calls this function to ignore the object when it's not needed.
If a requeue is needed, one should know and add it after the call to ReconcileObject().
If this causes bugs, they should surface in our tests, otherwise it means we're missing some tests.

objectKey := types.NamespacedName{
Name: T(&desiredObject).GetName(),
Namespace: T(&desiredObject).GetNamespace(),
Expand All @@ -34,24 +44,23 @@ func ReconcileObject[U any, T Reconcileable[U]](ctx context.Context, kClient cli
if errors.IsNotFound(err) {
if err := kClient.Create(ctx, T(&desiredObject)); err != nil {
if errors.IsAlreadyExists(err) {
return result.RequeueSoon(requeueDelay)
return result.RequeueSoon(requeueDelay), nil
}
return result.Error(err)
return result.Error(err), nil
}
return result.RequeueSoon(requeueDelay)
} else {
return result.Error(err)
return result.Continue(), &desiredObject
}
return result.Error(err), nil
}

if !annotations.CompareHashAnnotations(T(currentCm), T(&desiredObject)) {
resourceVersion := T(currentCm).GetResourceVersion()
T(&desiredObject).DeepCopyInto(currentCm)
T(currentCm).SetResourceVersion(resourceVersion)
if err := kClient.Update(ctx, T(currentCm)); err != nil {
return result.Error(err)
return result.Error(err), nil
}
return result.RequeueSoon(requeueDelay)
return result.Continue(), currentCm
}
return result.Done()
return result.Continue(), currentCm
}
11 changes: 5 additions & 6 deletions pkg/reconciliation/generic_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ func Test_ReconcileObject_UpdateDone(t *testing.T) {
kClient := testutils.NewFakeClientWRestMapper() // Reset the Client
// Launch reconciliation.
recRes := ReconcileObject(ctx, kClient, requeueDelay, desiredObject)
assert.True(t, recRes.IsRequeue())
// Should update immediately and signal we can continue
assert.False(t, recRes.Completed())
// After the update we should see the expected ConfigMap
afterUpdateCM := &corev1.ConfigMap{}
err := kClient.Get(ctx,
Expand All @@ -42,15 +43,13 @@ func Test_ReconcileObject_UpdateDone(t *testing.T) {
},
afterUpdateCM)
assert.NoError(t, err)
// If we reconcile again, we should move into the Done state.
recRes = ReconcileObject(ctx, kClient, requeueDelay, desiredObject)
assert.True(t, recRes.IsDone())
}

func Test_ReconcileObject_CreateSuccess(t *testing.T) {
kClient := testutils.NewFakeClientWRestMapper() // Reset the Client
recRes := ReconcileObject(ctx, kClient, requeueDelay, desiredObject)
assert.True(t, recRes.IsRequeue())
// Should create immediately and signal we can continue
assert.False(t, recRes.Completed())
actualCm := &corev1.ConfigMap{}
err := kClient.Get(ctx, types.NamespacedName{Name: desiredObject.Name, Namespace: desiredObject.Namespace}, actualCm)
assert.NoError(t, err)
Expand All @@ -75,7 +74,7 @@ func Test_ReconcileObject_UpdateSuccess(t *testing.T) {
}
// Launch reconciliation.
recRes := ReconcileObject(ctx, kClient, requeueDelay, desiredObject)
assert.True(t, recRes.IsRequeue())
assert.False(t, recRes.Completed())
annotations.AddHashAnnotation(&desiredObject)
// After the update we should see the expected ConfigMap
afterUpdateCM := &corev1.ConfigMap{}
Expand Down
55 changes: 52 additions & 3 deletions pkg/result/result_helper.go
Original file line number Diff line number Diff line change
@@ -1,20 +1,56 @@
package result

import (
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"time"

ctrl "sigs.k8s.io/controller-runtime"
)

// Copyright DataStax, Inc.
// Please see the included license file for details.

// This is just so that we can reference TerminalError in the Godoc of [Error]
var _ error = reconcile.TerminalError(nil)

// ReconcileResult represents the result of a step in the reconciliation process.
//
// We typically split the top-level Reconcile() method of a controller into multiple sub-functions. Each of these
// functions uses ReconcileResult to communicate to its caller how the current iteration should proceed. There are 4
// possible implementations: [Continue], [Done], [RequeueSoon], and [Error].
//
// The idiomatic way to handle a ReconcileResult in an intermediary sub-function is:
//
// if recResult := callStep1(); recResult.Completed() {
// // Global success, requeue or error: stop what we're doing and propagate up
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: based on the logic you've described here it sounded like a requeue (which is not a success) will also be Completed()?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, a requeue is Completed():

func (c callBackSoon) Completed() bool {
	return true
}

But I don't see any issue here: if a step wants a requeue, we stop what we're doing and propagate up. The top-level Reconcile() will eventually return recResult.Output() (which is ctrl.Result{Requeue: true, RequeueAfter: c.after}, nil) to the controller runtime.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me put my concern in another way. Right now, we have a struct implementation of this interface instantiated with Done() which is also Completed(). Requeue() and Error() are also Completed(). And yet, the English language meaning of the word Completed() is exactly the same as the meaning of the word Done().

To me, having a getter function Completed() for a field across three different struct implementations ( Done(), Requeue() and Error() - some of which do not mean "Done" in the linguistic sense) is completely untenable.

If you want to continue having something which means "this reconciliation run has progressed as far as possible, and needs to jump back to the top level of the reconciliation to be handled appropriately", we need a different word to Completed(). At that point, I think we can have a more productive conversation about what behaviours we are trying to model. But until we can express that intention in a function name, I would rather retain the explicit switch statements we have for Done(), Requeue() and `Error(), since this makes it explicit what behaviours we are handling.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is kind of why I think this needs a design doc too - because at the moment it isn't clear to me what behaviour or states we are trying to model, and we need to discuss and agree a target state rather than just say that we're "documenting" existing behaviour but then making quite impactful changes to the core of the reconciliation logic.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need to discuss and agree a target state rather than just say that we're "documenting" existing behaviour

Yes, the goal of this PR is to have that discussion. I proposed an initial version, let's iterate on it with reviews until we reach a common understanding.

this needs a design doc too

Are the go-docs not self-sufficient? Not that I mind having a separate document, but I wouldn't know what else to put in there.

And yet, the English language meaning of the word Completed() is exactly the same as the meaning of the word Done().

I've deliberately avoided going down the naming rabbithole so far... May I suggest that we keep that for a second step? I feel like it will get too messy if we rename as we go.
(but yes I agree with most of your comments)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't about documentation, it is about the development process. We do deep discussions in design docs, not on PRs. If we choose to do it differently here, I'm concerned that we will end up in the same situation that we ended up in when we started making changes to the name overrides in labels and downstream resources, where the blast radius of the change was not adequately considered and customers suffered issues as a result.

I strongly advocate for the creation of a design doc for this piece of work to avoid customer impacts.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And yet, the English language meaning of the word Completed() is exactly the same as the meaning of the word Done().

What is the English language definition difference of while and while do then? Yet, semantics for those two are different and in every programming language. One has while executed after first execution, while one has while executed before execution.

This isn't English.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with Miles, and let's not forget many of the people participating in these projects are not necessarily english native speakers. This API brings a level of confusion that is high enough to require having that conversation. There's overlap between Completed() (which should really be IsCompleted()), Done() and Continue() among other things which require to take a break and think of what this API is solving and how it should do it.
The fact that it's been used successfully (which is debatable) for the past few years isn't a valid reason to push back on the discussion.
I also think it's fair to have a separate doc, in the form of a markdown file, contributed as a PR so that the community can see it, to describe our view on the reconcile process and how ReconcileResult should be used.
"Design doc" might be an overstatement here and "doc" would probably be enough.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which should really be IsCompleted()

Ugly Java POJO, but this isn't a Java project. We don't need to indicate it's a boolean when a function has return value of boolean. Same as with "Get" when fetching an object.

I also think it's fair to have a separate doc, in the form of a markdown file, contributed as a PR so that the community can see it

Why can't the users view what's defined in the godoc? It would be publicly available and generated already, like currently:

https://pkg.go.dev/github.com/k8ssandra/[email protected]/pkg/reconciliation

What Olivier wrote here would appear there. We can add examples, descriptions etc. Adding markdown doc somewhere assumes users know to search for it (and hope it's not outdated) instead of the "standard" place.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In an effort to move forward again, let's just use these in code comments as docs on how to use this interface.
The comments will be available in the IDE and in the godocs, without requiring to look at an external doc.
We can add more information here if we feel there's not enough for anyone to build a good understanding of how to use it.
We can extract this into a doc if that makes sense to anyone, but let's not make that mandatory.

// return recResult
// }
// // Otherwise, proceed with the next step(s)
// if recResult := callStep2(); recResult.Completed() {
// // etc...
//
// The idiomatic way to handle a ReconcileResult in the top-level Reconcile() method is:
//
// recResult := callSomeSubmethod()
// // Possibly inspect the result (e.g. to set an error field in the status)
// return recResult.Output()
type ReconcileResult interface {
// Completed indicates that the current iteration of the reconciliation loop is complete, and the top-level
// Reconcile() method should return [ReconcileResult.Output] to the controller runtime.
//
// This returns true for a [Done] or terminal [Error] (where the output will stop the entire reconciliation loop);
// and for a [RequeueSoon] or regular [Error] (where the output will trigger a retry).
//
// This returns false for a [Continue].
Completed() bool
// Output converts this result into a format that the main Reconcile() method can return to the controller runtime.
//
// Calling this method on a [Continue] will panic.
Output() (ctrl.Result, error)
// IsError indicates whether this result is an [Error].
IsError() bool
// IsRequeue indicates whether this result is a [RequeueSoon].
IsRequeue() bool
// IsDone indicates whether this result is a [Done].
IsDone() bool
// GetError returns the wrapped error if the result is an [Error], otherwise it returns nil.
GetError() error
}

Expand Down Expand Up @@ -121,18 +157,31 @@ func (r errorOut) GetError() error {
return r.err
}

// Continue indicates that the current step in the reconciliation is done. The caller should proceed with the next step.
func Continue() ReconcileResult {
return continueReconcile{}
}

// Done indicates that the entire reconciliation loop was successful.
Copy link
Member

@Miles-Garnsey Miles-Garnsey Nov 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue: from a design perspective, this seems tricky, since only the last step should ever call Done() based on this description. No earlier step would ever know that the entire reconcilation was Done() from what I can see?

And what is the effect of calling Done()? What does the caller do that is special as a result of seeing that result type?

I think if we're getting into this topic, we should make Done() more descriptive - perhaps it needs a rename to AllReconciliationDone() or something similar to that, which indicates that all reconciliation steps to reconcile a given object (from the req passed to the controller) is done.

We need to be explicit about WHAT is done - that's the issue we're currently struggling with. Naming these implementations correctly might be a good starting point.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No earlier step would ever know that the entire reconcilation was Done() from what I can see?

This is exactly what it is, the reconciling process is done. It's not even really a necessary process didn't end in a pkg, but in the Reconcile() function. Just return reconcile.Result{}, nil.

We need to be explicit about WHAT is done - that's the issue we're currently struggling with. Naming these implementations correctly might be a good starting point.

Are we having issues? This API didn't even begin here and the original usage had no issues. I'm not convinced we need this ReconcileResult API at all, but then that's not what we're discussing here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we having issues? This API didn't even begin here and the original usage had no issues. I'm not convinced we need this ReconcileResult API at all, but then that's not what we're discussing here.

Maybe this should be something we're discussing. We are definitely having issues because there is no consistency in the usage of this API in the codebase and there are regular disagreements on what Done, Continue, and Completed mean, as there have been for several years.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We are definitely having issues because there is no consistency in the usage of this API in the codebase and there are regular disagreements on what Done, Continue, and Completed mean, as there have been for several years.

No, there hasn't been (and even in this discussion, it doesn't seem like there's that much issue for rest of the team). This API was copied over from cass-operator and it had a clear definition there how things worked. This isn't new API that needs design documents (which are utterly pointless at this point of using this API), just couple of fixes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, there hasn't been

At the very least there's been some ambiguity, as evidenced by the changes we are doing in this PR to align different parts of the code.

it had a clear definition there how things worked

This might be your longer experience of cass-operator speaking, but personally I wouldn't call this API clear, it took way more effort to understand than it should.
Anyway, whether it is clear or not, I don't see any downside to documenting the intent.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's focus on whether or not we agree with the comments that @olim7t added on each function.
@burmanm @Miles-Garnsey, do these comments look good?

Let's not change names for now and just build a common understanding.

// The caller should skip the next steps (if any), and propagate the result up the stack. This will eventually reach the
// top-level Reconcile() method, which should stop the reconciliation.
func Done() ReconcileResult {
return done{}
}

// RequeueSoon indicates that the current step in the reconciliation requires a requeue after a certain amount of time.
// The caller should skip the next steps (if any), and propagate the result up the stack. This will eventually reach the
// top-level Reconcile() method, which should schedule a requeue with the given delay.
func RequeueSoon(after time.Duration) ReconcileResult {
return callBackSoon{after: after}
}

// Error indicates that the current step in the reconciliation has failed.
// The caller should skip the next steps (if any), and propagate the result up the stack. This will eventually reach the
// top-level Reconcile() method, which should return the error to the controller runtime.
//
// If the argument is wrapped with [reconcile.TerminalError], the reconciliation loop will stop; otherwise, it will be
// retried with exponential backoff.
func Error(e error) ReconcileResult {
return errorOut{err: e}
}
6 changes: 1 addition & 5 deletions pkg/telemetry/cassandra_agent/cassandra_agent_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,7 @@ func (c Configurator) ReconcileTelemetryAgentConfig(dc *cassdcapi.CassandraDatac
}
desiredCm.SetLabels(labels.CleanedUpByLabels(KlKey))

recRes := reconciliation.ReconcileObject(c.Ctx, c.RemoteClient, c.RequeueDelay, *desiredCm)
switch {
case recRes.IsError():
return recRes
case recRes.IsRequeue():
if recRes := reconciliation.ReconcileObject(c.Ctx, c.RemoteClient, c.RequeueDelay, *desiredCm); recRes.Completed() {
return recRes
}

Expand Down
13 changes: 11 additions & 2 deletions test/e2e/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -842,7 +842,7 @@ func createSingleDatacenterCluster(t *testing.T, ctx context.Context, namespace
require.NoError(err, "failed to patch K8ssandraCluster in namespace %s", namespace)
checkStargateReady(t, f, ctx, stargateKey)
checkStargateK8cStatusReady(t, f, ctx, kcKey, dcKey)
checkContainerPresence(t, ctx, f, stargateDeploymentKey, k8ssandra, getPodTemplateSpec, stargate.VectorContainerName)
checkContainerPresenceEventually(t, ctx, f, stargateDeploymentKey, k8ssandra, getPodTemplateSpec, stargate.VectorContainerName)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same issue as in unit tests: because the requeue is not ignored anymore, this is taking longer than before and subject to timing issues.

checkVectorAgentConfigMapPresence(t, ctx, f, dcKey, stargate.VectorAgentConfigMapName)

t.Log("check that if Stargate is deleted directly it gets re-created")
Expand All @@ -853,7 +853,7 @@ func createSingleDatacenterCluster(t *testing.T, ctx context.Context, namespace
require.NoError(err, "failed to delete Stargate in namespace %s", namespace)
checkStargateReady(t, f, ctx, stargateKey)

checkContainerPresence(t, ctx, f, stargateDeploymentKey, k8ssandra, getPodTemplateSpec, stargate.VectorContainerName)
checkContainerPresenceEventually(t, ctx, f, stargateDeploymentKey, k8ssandra, getPodTemplateSpec, stargate.VectorContainerName)
checkVectorAgentConfigMapPresence(t, ctx, f, dcKey, stargate.VectorAgentConfigMapName)

t.Log("delete Stargate in k8ssandracluster resource")
Expand Down Expand Up @@ -2324,6 +2324,15 @@ func checkContainerPresence(t *testing.T, ctx context.Context, f *framework.E2eF
require.True(t, containerFound, "cannot find Container in pod template spec")
}

func checkContainerPresenceEventually(t *testing.T, ctx context.Context, f *framework.E2eFramework, podKey framework.ClusterKey, kc *api.K8ssandraCluster, specFunction func(t *testing.T, ctx context.Context, f *framework.E2eFramework, dcKey framework.ClusterKey, kc *api.K8ssandraCluster) *corev1.PodTemplateSpec, containerName string) {
t.Logf("check that %s contains Container named %s", podKey.Name, containerName)
require.Eventually(t, func() bool {
podTempSpec := specFunction(t, ctx, f, podKey, kc)
_, containerFound := cassandra.FindContainer(podTempSpec, containerName)
return containerFound
}, polling.stargateReady.timeout, polling.stargateReady.interval, "cannot find Container in pod template spec")
}

func checkContainerDeleted(t *testing.T, ctx context.Context, f *framework.E2eFramework, podKey framework.ClusterKey, kc *api.K8ssandraCluster, specFunction func(t *testing.T, ctx context.Context, f *framework.E2eFramework, dcKey framework.ClusterKey, kc *api.K8ssandraCluster) *corev1.PodTemplateSpec, containerName string) {
t.Logf("check that %s does not have a Container named %s", podKey.Name, containerName)
podTempSpec := specFunction(t, ctx, f, podKey, kc)
Expand Down
Loading