diff --git a/pkg/clustermanager/cluster_manager.go b/pkg/clustermanager/cluster_manager.go index a0ec614c7b5d..99d0a8b7ca57 100644 --- a/pkg/clustermanager/cluster_manager.go +++ b/pkg/clustermanager/cluster_manager.go @@ -61,6 +61,7 @@ const ( var ( clusterctlNetworkErrorRegex = regexp.MustCompile(`.*failed to connect to the management cluster:.*`) clusterctlMoveProvisionedInfraErrorRegex = regexp.MustCompile(`.*failed to check for provisioned infrastructure*`) + kubectlResourceNotFoundRegex = regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`) eksaClusterResourceType = fmt.Sprintf("clusters.%s", v1alpha1.GroupVersion.Group) ) @@ -221,7 +222,7 @@ func WithNoTimeouts() ClusterManagerOpt { func clusterctlMoveWaitForInfrastructureRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) { // Retry both network and cluster move errors. if match := (clusterctlNetworkErrorRegex.MatchString(err.Error()) || clusterctlMoveProvisionedInfraErrorRegex.MatchString(err.Error())); match { - return true, clusterctlMoveRetryWaitTime(totalRetries) + return true, exponentialRetryWaitTime(totalRetries) } return false, 0 } @@ -229,12 +230,24 @@ func clusterctlMoveWaitForInfrastructureRetryPolicy(totalRetries int, err error) func clusterctlMoveRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) { // Retry only network errors. if match := clusterctlNetworkErrorRegex.MatchString(err.Error()); match { - return true, clusterctlMoveRetryWaitTime(totalRetries) + return true, exponentialRetryWaitTime(totalRetries) } return false, 0 } -func clusterctlMoveRetryWaitTime(totalRetries int) time.Duration { +func kubectlWaitRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) { + // Sometimes it is possible that the clusterctl move is successful, + // but the clusters.cluster.x-k8s.io resource is not available on the cluster yet. + // + // Retry on transient 'server doesn't have a resource type' errors. + // Use existing exponential backoff implementation for retry on these errors. + if match := kubectlResourceNotFoundRegex.MatchString(err.Error()); match { + return true, exponentialRetryWaitTime(totalRetries) + } + return false, 0 +} + +func exponentialRetryWaitTime(totalRetries int) time.Duration { // Exponential backoff on errors. Retrier built-in backoff is linear, so implementing here. // Retrier first calls the policy before retry #1. We want it zero-based for exponentiation. @@ -307,7 +320,10 @@ func (c *ClusterManager) MoveCAPI(ctx context.Context, from, to *types.Cluster, } logger.V(3).Info("Waiting for workload cluster control plane to be ready after move") - err = c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName) + r = retrier.New(c.clusterctlMoveTimeout, retrier.WithRetryPolicy(kubectlWaitRetryPolicy)) + err = r.Retry(func() error { + return c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName) + }) if err != nil { return err } diff --git a/pkg/clustermanager/cluster_manager_test.go b/pkg/clustermanager/cluster_manager_test.go index f2c00e0863ef..b25f5b0b4834 100644 --- a/pkg/clustermanager/cluster_manager_test.go +++ b/pkg/clustermanager/cluster_manager_test.go @@ -532,6 +532,62 @@ func TestClusterManagerMoveCAPIRetrySuccess(t *testing.T) { } } +func TestClusterManagerMoveKubectlWaitRetrySuccess(t *testing.T) { + from := &types.Cluster{ + Name: "from-cluster", + } + to := &types.Cluster{ + Name: "to-cluster", + } + clusterSpec := test.NewClusterSpec(func(s *cluster.Spec) { + s.Cluster.Name = to.Name + s.Cluster.Spec.ControlPlaneConfiguration.Count = 3 + s.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{{Count: ptr.Int(3), MachineGroupRef: &v1alpha1.Ref{Name: "test-wn"}}} + }) + ctx := context.Background() + + c, m := newClusterManager(t) + kcp, mds := getKcpAndMdsForNodeCount(0) + m.client.EXPECT().GetKubeadmControlPlane(ctx, + from, + to.Name, + gomock.AssignableToTypeOf(executables.WithCluster(from)), + gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)), + ).Return(kcp, nil) + m.client.EXPECT().GetMachineDeploymentsForCluster(ctx, + to.Name, + gomock.AssignableToTypeOf(executables.WithCluster(from)), + gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)), + ).Return(mds, nil) + m.client.EXPECT().GetMachines(ctx, from, to.Name) + m.client.EXPECT().WaitForClusterReady(ctx, from, "1h0m0s", to.Name) + m.client.EXPECT().MoveManagement(ctx, from, to, to.Name) + firstTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(errors.New("executing wait: error: the server doesn't have a resource type \"clusters\"")) + secondTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(nil) + gomock.InOrder( + firstTry, + secondTry, + ) + m.client.EXPECT().ValidateControlPlaneNodes(ctx, to, to.Name) + m.client.EXPECT().CountMachineDeploymentReplicasReady(ctx, to.Name, to.KubeconfigFile) + m.client.EXPECT().GetKubeadmControlPlane(ctx, + to, + to.Name, + gomock.AssignableToTypeOf(executables.WithCluster(from)), + gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)), + ).Return(kcp, nil) + m.client.EXPECT().GetMachineDeploymentsForCluster(ctx, + to.Name, + gomock.AssignableToTypeOf(executables.WithCluster(from)), + gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)), + ).Return(mds, nil) + m.client.EXPECT().GetMachines(ctx, to, to.Name) + + if err := c.MoveCAPI(ctx, from, to, to.Name, clusterSpec); err != nil { + t.Errorf("ClusterManager.MoveCAPI() error = %v, wantErr nil", err) + } +} + func TestClusterManagerMoveCAPIErrorMove(t *testing.T) { from := &types.Cluster{ Name: "from-cluster",