K8OP-294 Do not try to work out backup status if there are no pods (#1462)

* K8OP-294 Do not try to work out backup status if there are no pods

* Fix the medusa task controller test

* Do not duplicate the slice with pods

* Return an error, not nil, if backupsummary is not found

* Replace int32 with int when deleting pods in test

* Return nil backup summary for backup with no pods

* Remove get before delete in medusabackupjob_controller_test

* Use a dedicated test backup for the nil backupSummary case

* Return reconcile.TerminalError if getBackupSummary originates the error

* Expect backup with nil summary to actually start
rzvoncek authored Jan 6, 2025
1 parent d00a94b commit a5338e2
Showing 4 changed files with 138 additions and 62 deletions.
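
Taken together, the controller-side changes in the commit bullets above boil down to three guards: requeue when the datacenter has no pods, treat a nil remote backup summary as an error, and stop retrying when the summary cannot be found at all. The following self-contained sketch condenses that behaviour with stand-in types; the real code lives in controllers/medusa/medusabackupjob_controller.go in the diff below.

```go
package main

// Condensed sketch of the K8OP-294 guards (stand-in types, not the actual
// controller code from this commit).

import (
	"errors"
	"fmt"
)

type pod struct{ name string }

type backupSummary struct{ backupName string }

// errNotFound stands in for the reconcile.TerminalError wrapping used in the real controller.
var errNotFound = errors.New("backup summary couldn't be found")

// getBackupSummary mirrors the new nil check and the "not found" error.
func getBackupSummary(remoteBackups []*backupSummary, wanted string) (*backupSummary, error) {
	for _, remote := range remoteBackups {
		if remote == nil {
			return nil, fmt.Errorf("backup %s summary is nil", wanted)
		}
		if remote.backupName == wanted {
			return remote, nil
		}
	}
	return nil, errNotFound
}

func main() {
	pods := []pod{} // the K8OP-294 scenario: the StatefulSet exists but has no pods
	if len(pods) == 0 {
		// The real reconciler returns ctrl.Result{Requeue: true} at this point.
		fmt.Println("no pods found for datacenter, requeueing")
		return
	}
	if _, err := getBackupSummary(nil, "good-backup"); err != nil {
		fmt.Println("backup failed:", err)
	}
}
```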
1 change: 1 addition & 0 deletions CHANGELOG/CHANGELOG-1.21.md
@@ -19,3 +19,4 @@ When cutting a new release, update the `unreleased` heading to the tag being gen
* [BUGFIX] [#1383](https://github.com/k8ssandra/k8ssandra-operator/issues/1383) Do not create MedusaBackup if MedusaBackupJob did not fully succeed
* [ENHANCEMENT] [#1667](https://github.com/k8ssandra/k8ssandra/issues/1667) Add `skipSchemaMigration` option to `K8ssandraCluster.spec.reaper`
* [FEATURE] [#1034](https://github.com/k8ssandra/k8ssandra-operator/issues/1034) Add support for priorityClassName
* [BUGFIX] [#1454](https://github.com/k8ssandra/k8ssandra-operator/issues/1454) Do not try to work out backup status if there are no pods
20 changes: 15 additions & 5 deletions controllers/medusa/medusabackupjob_controller.go
@@ -23,7 +23,7 @@ import (
"strings"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apiErrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
@@ -33,6 +33,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/go-logr/logr"
cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
@@ -67,7 +68,7 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
err := r.Get(ctx, req.NamespacedName, instance)
if err != nil {
logger.Error(err, "Failed to get MedusaBackupJob")
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
return ctrl.Result{}, nil
}
return ctrl.Result{}, err
@@ -103,6 +104,10 @@ func (r *MedusaBackupJobReconciler) Reconcile(ctx context.Context, req ctrl.Requ
logger.Error(err, "Failed to get datacenter pods")
return ctrl.Result{}, err
}
if len(pods) == 0 {
logger.Info("No pods found for datacenter", "CassandraDatacenter", cassdcKey)
return ctrl.Result{Requeue: true}, nil
}

// If there is anything in progress, simply requeue the request until each pod has finished or errored
if len(backupJob.Status.InProgress) > 0 {
@@ -240,6 +245,11 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup
return nil, err
} else {
for _, remoteBackup := range remoteBackups {
if remoteBackup == nil {
err := fmt.Errorf("backup %s summary is nil", backup.Name)
logger.Error(err, "remote backup is nil")
return nil, err
}
logger.Info("found backup", "CassandraPod", pod.Name, "Backup", remoteBackup.BackupName)
if backup.ObjectMeta.Name == remoteBackup.BackupName {
return remoteBackup, nil
@@ -248,7 +258,7 @@ func (r *MedusaBackupJobReconciler) getBackupSummary(ctx context.Context, backup
}
}
}
return nil, nil
return nil, reconcile.TerminalError(fmt.Errorf("backup summary couldn't be found"))
}

func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, backup *medusav1alpha1.MedusaBackupJob, backupSummary *medusa.BackupSummary, logger logr.Logger) error {
@@ -257,7 +267,7 @@ func (r *MedusaBackupJobReconciler) createMedusaBackup(ctx context.Context, back
backupKey := types.NamespacedName{Namespace: backup.ObjectMeta.Namespace, Name: backup.Name}
backupResource := &medusav1alpha1.MedusaBackup{}
if err := r.Get(ctx, backupKey, backupResource); err != nil {
if errors.IsNotFound(err) {
if apiErrors.IsNotFound(err) {
// Backup doesn't exist, create it
startTime := backup.Status.StartTime
finishTime := metav1.Now()
@@ -331,7 +341,7 @@ func backupStatus(ctx context.Context, name string, pod *corev1.Pod, clientFacto
resp, err := medusaClient.BackupStatus(ctx, name)
if err != nil {
// the gRPC client does not return proper NotFound error, we need to check the payload too
if errors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") {
if apiErrors.IsNotFound(err) || strings.Contains(err.Error(), "NotFound") {
logger.Info(fmt.Sprintf("did not find backup %s for pod %s", name, pod.Name))
return medusa.StatusType_UNKNOWN, nil
}
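
The getBackupSummary change above leans on controller-runtime's reconcile.TerminalError, which this file now imports: an error wrapped this way is still logged and recorded in metrics, but the request is not requeued for another attempt, whereas a plain error is retried with backoff. A minimal sketch of that distinction in a hypothetical reconciler (ExampleReconciler and findSummary are illustrative stand-ins, not code from this repo):

```go
package example

import (
	"context"
	"fmt"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
)

type ExampleReconciler struct{}

// findSummary stands in for looking up a backup summary in remote storage.
func findSummary(ctx context.Context, name string) (bool, error) {
	return false, nil
}

func (r *ExampleReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	found, err := findSummary(ctx, req.Name)
	if err != nil {
		// Transient failure: a plain error makes controller-runtime retry the request.
		return ctrl.Result{}, err
	}
	if !found {
		// Retrying will not make the summary appear, so give up without requeueing.
		return ctrl.Result{}, reconcile.TerminalError(fmt.Errorf("backup summary %q couldn't be found", req.Name))
	}
	return ctrl.Result{}, nil
}
```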
170 changes: 117 additions & 53 deletions controllers/medusa/medusabackupjob_controller_test.go
@@ -7,6 +7,7 @@ import (
"strings"
"sync"
"testing"
"time"

cassdcapi "github.com/k8ssandra/cass-operator/apis/cassandra/v1beta1"
k8ss "github.com/k8ssandra/k8ssandra-operator/apis/k8ssandra/v1alpha1"
@@ -31,6 +32,8 @@ const (
successfulBackupName = "good-backup"
failingBackupName = "bad-backup"
missingBackupName = "missing-backup"
backupWithNoPods = "backup-with-no-pods"
backupWithNilSummary = "backup-with-nil-summary"
dc1PodPrefix = "192.168.1."
dc2PodPrefix = "192.168.2."
fakeBackupFileCount = int64(13)
@@ -167,6 +170,13 @@ func testMedusaBackupDatacenter(t *testing.T, ctx context.Context, f *framework.
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, missingBackupName)
require.False(backupCreated, "the backup object shouldn't have been created")

// in K8OP-294 we found out we can try to make backups on StatefulSets with no pods
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNoPods)
require.False(backupCreated, "the backup object shouldn't have been created")
// in that same effort, we also found out we can have nil instead of a backup summary
backupCreated = createAndVerifyMedusaBackup(dc1Key, dc1, f, ctx, require, t, namespace, backupWithNilSummary)
require.False(backupCreated, "the backup object shouldn't have been created")

err = f.DeleteK8ssandraCluster(ctx, client.ObjectKey{Namespace: kc.Namespace, Name: kc.Name}, timeout, interval)
require.NoError(err, "failed to delete K8ssandraCluster")
verifyObjectDoesNotExist(ctx, t, f, dc1Key, &cassdcapi.CassandraDatacenter{})
@@ -210,6 +220,17 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa

createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)

// one test scenario needs to have no pods available in the STSs (see #1454)
// we reproduce that by deleting the pods. we need this for the medusa backup controller tests to work
// however, we need to bring them back up because medusa task controller tests interact with them later
// both backup and task controller tests use this function to verify backups
if backupName == backupWithNoPods {
deleteDatacenterPods(t, f, ctx, dcKey, dc)
deleteDatacenterPods(t, f, ctx, dcKeyCopy, dc)
defer createDatacenterPods(t, f, ctx, dcKey, dc)
defer createDatacenterPods(t, f, ctx, dcKeyCopy, dcCopy)
}

t.Log("creating MedusaBackupJob")
backupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
backup := &api.MedusaBackupJob{
@@ -225,39 +246,25 @@
err := f.Create(ctx, backupKey, backup)
require.NoError(err, "failed to create MedusaBackupJob")

t.Log("verify that the backups are started")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
if backupName != backupWithNoPods {
t.Log("verify that the backups start eventually")
verifyBackupJobStarted(require.Eventually, t, dc, f, ctx, backupKey)
if backupName != backupWithNilSummary {
// backup that will receive a nil summary will never finish, can't assert on that
verifyBackupJobFinished(t, require, dc, f, ctx, backupKey, backupName)
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)

t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
} else {
t.Log("verify that the backups never start")
verifyBackupJobStarted(require.Never, t, dc, f, ctx, backupKey)
}

t.Log("check for the MedusaBackup being created")
medusaBackupKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, backupName)
medusaBackup := &api.MedusaBackup{}
err = f.Get(ctx, medusaBackupKey, medusaBackup)
if err != nil {
if errors.IsNotFound(err) {
// exit the test if the backup was not created. this happens for some backup names on purpose
return false
}
}
@@ -274,6 +281,43 @@ func createAndVerifyMedusaBackup(dcKey framework.ClusterKey, dc *cassdcapi.Cassa
return true
}

func verifyBackupJobFinished(t *testing.T, require *require.Assertions, dc *cassdcapi.CassandraDatacenter, f *framework.Framework, ctx context.Context, backupKey framework.ClusterKey, backupName string) {
t.Log("verify the backup finished")
require.Eventually(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
return false
}
t.Logf("backup %s finish time: %v", backupName, updated.Status.FinishTime)
t.Logf("backup %s failed: %v", backupName, updated.Status.Failed)
t.Logf("backup %s finished: %v", backupName, updated.Status.Finished)
t.Logf("backup %s in progress: %v", backupName, updated.Status.InProgress)
return !updated.Status.FinishTime.IsZero()
}, timeout, interval)
}

func verifyBackupJobStarted(
verifyFunction func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{}),
t *testing.T,
dc *cassdcapi.CassandraDatacenter,
f *framework.Framework,
ctx context.Context,
backupKey framework.ClusterKey,
) {
verifyFunction(func() bool {
t.Logf("Requested backups: %v", medusaClientFactory.GetRequestedBackups(dc.DatacenterName()))
updated := &api.MedusaBackupJob{}
err := f.Get(ctx, backupKey, updated)
if err != nil {
t.Logf("failed to get MedusaBackupJob: %v", err)
return false
}
return !updated.Status.StartTime.IsZero()
}, timeout, interval)
}

func reconcileReplicatedSecret(ctx context.Context, t *testing.T, f *framework.Framework, kc *k8ss.K8ssandraCluster) {
t.Log("check ReplicatedSecret reconciled")

@@ -396,35 +440,40 @@ func (c *fakeMedusaClient) GetBackups(ctx context.Context) ([]*medusa.BackupSumm
status = *medusa.StatusType_IN_PROGRESS.Enum()
}

backup := &medusa.BackupSummary{
BackupName: name,
StartTime: 0,
FinishTime: 10,
TotalNodes: 3,
FinishedNodes: 3,
TotalObjects: fakeBackupFileCount,
TotalSize: fakeBackupByteSize,
Status: status,
Nodes: []*medusa.BackupNode{
{
Host: "host1",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host2",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host3",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
var backup *medusa.BackupSummary
if strings.HasPrefix(name, backupWithNilSummary) {
backup = nil
} else {
backup = &medusa.BackupSummary{
BackupName: name,
StartTime: 0,
FinishTime: 10,
TotalNodes: 3,
FinishedNodes: 3,
TotalObjects: fakeBackupFileCount,
TotalSize: fakeBackupByteSize,
Status: status,
Nodes: []*medusa.BackupNode{
{
Host: "host1",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host2",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
{
Host: "host3",
Tokens: []int64{1, 2, 3},
Datacenter: "dc1",
Rack: "rack1",
},
},
},
}
}
backups = append(backups, backup)
}
@@ -488,6 +537,21 @@ func findDatacenterCondition(status *cassdcapi.CassandraDatacenterStatus, condTy
return nil
}

func deleteDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
for i := 0; i < int(dc.Spec.Size); i++ {

podName := fmt.Sprintf("%s-%s-%d", dc.Spec.ClusterName, dc.DatacenterName(), i)
podKey := framework.NewClusterKey(dcKey.K8sContext, dcKey.Namespace, podName)
pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: podKey.Name, Namespace: podKey.Namespace},
}
err := f.Delete(ctx, podKey, pod)
if err != nil {
t.Logf("failed to delete pod %s: %v", podKey, err)
}
}
}

func createDatacenterPods(t *testing.T, f *framework.Framework, ctx context.Context, dcKey framework.ClusterKey, dc *cassdcapi.CassandraDatacenter) {
_ = f.CreateNamespace(dcKey.Namespace)
for i := int32(0); i < dc.Spec.Size; i++ {
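
The test refactor above works because require.Eventually and require.Never have identical signatures on *require.Assertions, so verifyBackupJobStarted can take either one as its verifyFunction and assert that a backup eventually starts or never starts. A small, hypothetical test showing the same pattern:

```go
package example

import (
	"testing"
	"time"

	"github.com/stretchr/testify/require"
)

// verifyFn matches the shared signature of (*require.Assertions).Eventually and Never.
type verifyFn func(condition func() bool, waitFor time.Duration, tick time.Duration, msgAndArgs ...interface{})

// verifyStarted polls the condition with whichever assertion the caller picked.
func verifyStarted(verify verifyFn, started func() bool) {
	verify(started, 2*time.Second, 100*time.Millisecond)
}

func TestStartedOrNot(t *testing.T) {
	req := require.New(t)
	verifyStarted(req.Eventually, func() bool { return true }) // must eventually start
	verifyStarted(req.Never, func() bool { return false })     // must never start
}
```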
9 changes: 5 additions & 4 deletions pkg/medusa/utils.go
@@ -2,6 +2,7 @@ package medusa

import (
"context"
"errors"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
@@ -21,8 +22,8 @@ func GetCassandraDatacenterPods(ctx context.Context, cassdc *cassdcapi.Cassandra
return nil, err
}

pods := make([]corev1.Pod, 0)
pods = append(pods, podList.Items...)

return pods, nil
if podList.Items != nil {
return podList.Items, nil
}
return nil, errors.New("podList came with nil Items field")
}
