Add retries around kubectl wait
Signed-off-by: Rahul Ganesh <[email protected]>
Rahul Ganesh committed Dec 11, 2024
1 parent a77f775 commit a951ea5
Showing 2 changed files with 76 additions and 4 deletions.
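The commit guards the post-move `kubectl wait` (WaitForControlPlaneReady) against the window where `clusterctl move` has finished but the `clusters.cluster.x-k8s.io` resource type is not yet served by the target cluster: the wait is now wrapped in a retrier with a new kubectlWaitRetryPolicy that retries on `the server doesn't have a resource type` errors using the existing exponential backoff. Below is a minimal, stdlib-only sketch of that policy-driven retry pattern; the names, the fixed 2s wait, and the retry loop are illustrative stand-ins, not the repository's retrier package (which computes an exponential wait).

```go
// Illustrative sketch only: the real code uses the repository's retrier
// package (retrier.New + retrier.WithRetryPolicy); names here are hypothetical.
package main

import (
	"errors"
	"fmt"
	"regexp"
	"time"
)

// Same shape as the kubectlResourceNotFoundRegex added in cluster_manager.go.
var resourceNotFoundRegex = regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`)

// policy mirrors the (retry bool, wait time.Duration) signature used by the
// cluster manager's retry policies; the 2s wait is a hypothetical constant.
func policy(totalRetries int, err error) (bool, time.Duration) {
	if resourceNotFoundRegex.MatchString(err.Error()) {
		return true, 2 * time.Second
	}
	return false, 0
}

// retryWithPolicy is a minimal stand-in for the retrier: call fn until it
// succeeds or the policy declines to retry.
func retryWithPolicy(fn func() error, policy func(int, error) (bool, time.Duration)) error {
	for attempt := 1; ; attempt++ {
		err := fn()
		if err == nil {
			return nil
		}
		retryAgain, wait := policy(attempt, err)
		if !retryAgain {
			return err
		}
		time.Sleep(wait)
	}
}

func main() {
	calls := 0
	// waitForControlPlane stands in for clusterClient.WaitForControlPlaneReady.
	waitForControlPlane := func() error {
		calls++
		if calls == 1 {
			// clusterctl move finished, but the resource type is not served yet.
			return errors.New(`executing wait: error: the server doesn't have a resource type "clusters"`)
		}
		return nil
	}
	err := retryWithPolicy(waitForControlPlane, policy)
	fmt.Printf("succeeded after %d calls, err=%v\n", calls, err)
}
```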
24 changes: 20 additions & 4 deletions pkg/clustermanager/cluster_manager.go
@@ -61,6 +61,7 @@ const (
 var (
     clusterctlNetworkErrorRegex = regexp.MustCompile(`.*failed to connect to the management cluster:.*`)
     clusterctlMoveProvisionedInfraErrorRegex = regexp.MustCompile(`.*failed to check for provisioned infrastructure*`)
+    kubectlResourceNotFoundRegex = regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`)
     eksaClusterResourceType = fmt.Sprintf("clusters.%s", v1alpha1.GroupVersion.Group)
 )

@@ -211,20 +212,32 @@ func WithNoTimeouts() ClusterManagerOpt {
 func clusterctlMoveWaitForInfrastructureRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
     // Retry both network and cluster move errors.
     if match := (clusterctlNetworkErrorRegex.MatchString(err.Error()) || clusterctlMoveProvisionedInfraErrorRegex.MatchString(err.Error())); match {
-        return true, clusterctlMoveRetryWaitTime(totalRetries)
+        return true, exponentialRetryWaitTime(totalRetries)
     }
     return false, 0
 }
 
 func clusterctlMoveRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
     // Retry only network errors.
     if match := clusterctlNetworkErrorRegex.MatchString(err.Error()); match {
-        return true, clusterctlMoveRetryWaitTime(totalRetries)
+        return true, exponentialRetryWaitTime(totalRetries)
     }
     return false, 0
 }
 
-func clusterctlMoveRetryWaitTime(totalRetries int) time.Duration {
+func kubectlWaitRetryPolicy(totalRetries int, err error) (retry bool, wait time.Duration) {
+    // Sometimes it is possible that the clusterctl move is successful,
+    // but the clusters.cluster.x-k8s.io resource is not available on the cluster yet.
+    //
+    // Retry on transient 'server doesn't have a resource type' errors.
+    // Use existing exponential backoff implementation for retry on these errors.
+    if match := kubectlResourceNotFoundRegex.MatchString(err.Error()); match {
+        return true, exponentialRetryWaitTime(totalRetries)
+    }
+    return false, 0
+}
+
+func exponentialRetryWaitTime(totalRetries int) time.Duration {
     // Exponential backoff on errors. Retrier built-in backoff is linear, so implementing here.
 
     // Retrier first calls the policy before retry #1. We want it zero-based for exponentiation.
@@ -297,7 +310,10 @@ func (c *ClusterManager) MoveCAPI(ctx context.Context, from, to *types.Cluster,
     }
 
     logger.V(3).Info("Waiting for workload cluster control plane to be ready after move")
-    err = c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName)
+    r = retrier.New(c.clusterctlMoveTimeout, retrier.WithRetryPolicy(kubectlWaitRetryPolicy))
+    err = r.Retry(func() error {
+        return c.clusterClient.WaitForControlPlaneReady(ctx, to, c.controlPlaneWaitAfterMoveTimeout.String(), clusterName)
+    })
     if err != nil {
         return err
     }
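The body of exponentialRetryWaitTime is collapsed above; its comments describe a zero-based exponential backoff implemented by hand because the retrier's built-in backoff is linear, with the policy first consulted before retry #1. A small illustration of such a scheme follows, with an assumed 10s base wait that is not taken from the repository.

```go
// Illustration of a zero-based exponential backoff as described by the
// comments above; the 10s base is an assumption, not the repository's value.
package main

import (
	"fmt"
	"time"
)

func exponentialWait(totalRetries int) time.Duration {
	const baseWait = 10 * time.Second // hypothetical base
	exponent := totalRetries - 1      // zero-based: the first retry waits baseWait
	return baseWait << uint(exponent) // baseWait * 2^exponent
}

func main() {
	for retries := 1; retries <= 5; retries++ {
		fmt.Printf("retry %d -> wait %s\n", retries, exponentialWait(retries))
	}
	// Prints waits of 10s, 20s, 40s, 1m20s, 2m40s.
}
```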
56 changes: 56 additions & 0 deletions pkg/clustermanager/cluster_manager_test.go
@@ -532,6 +532,62 @@ func TestClusterManagerMoveCAPIRetrySuccess(t *testing.T) {
     }
 }
 
+func TestClusterManagerMoveKubectlWaitRetrySuccess(t *testing.T) {
+    from := &types.Cluster{
+        Name: "from-cluster",
+    }
+    to := &types.Cluster{
+        Name: "to-cluster",
+    }
+    clusterSpec := test.NewClusterSpec(func(s *cluster.Spec) {
+        s.Cluster.Name = to.Name
+        s.Cluster.Spec.ControlPlaneConfiguration.Count = 3
+        s.Cluster.Spec.WorkerNodeGroupConfigurations = []v1alpha1.WorkerNodeGroupConfiguration{{Count: ptr.Int(3), MachineGroupRef: &v1alpha1.Ref{Name: "test-wn"}}}
+    })
+    ctx := context.Background()
+
+    c, m := newClusterManager(t)
+    kcp, mds := getKcpAndMdsForNodeCount(0)
+    m.client.EXPECT().GetKubeadmControlPlane(ctx,
+        from,
+        to.Name,
+        gomock.AssignableToTypeOf(executables.WithCluster(from)),
+        gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+    ).Return(kcp, nil)
+    m.client.EXPECT().GetMachineDeploymentsForCluster(ctx,
+        to.Name,
+        gomock.AssignableToTypeOf(executables.WithCluster(from)),
+        gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+    ).Return(mds, nil)
+    m.client.EXPECT().GetMachines(ctx, from, to.Name)
+    m.client.EXPECT().WaitForClusterReady(ctx, from, "1h0m0s", to.Name)
+    m.client.EXPECT().MoveManagement(ctx, from, to, to.Name)
+    firstTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(errors.New("executing wait: error: the server doesn't have a resource type \"clusters\""))
+    secondTry := m.client.EXPECT().WaitForControlPlaneReady(ctx, to, "15m0s", to.Name).Return(nil)
+    gomock.InOrder(
+        firstTry,
+        secondTry,
+    )
+    m.client.EXPECT().ValidateControlPlaneNodes(ctx, to, to.Name)
+    m.client.EXPECT().CountMachineDeploymentReplicasReady(ctx, to.Name, to.KubeconfigFile)
+    m.client.EXPECT().GetKubeadmControlPlane(ctx,
+        to,
+        to.Name,
+        gomock.AssignableToTypeOf(executables.WithCluster(from)),
+        gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+    ).Return(kcp, nil)
+    m.client.EXPECT().GetMachineDeploymentsForCluster(ctx,
+        to.Name,
+        gomock.AssignableToTypeOf(executables.WithCluster(from)),
+        gomock.AssignableToTypeOf(executables.WithNamespace(constants.EksaSystemNamespace)),
+    ).Return(mds, nil)
+    m.client.EXPECT().GetMachines(ctx, to, to.Name)
+
+    if err := c.MoveCAPI(ctx, from, to, to.Name, clusterSpec); err != nil {
+        t.Errorf("ClusterManager.MoveCAPI() error = %v, wantErr nil", err)
+    }
+}
+
 func TestClusterManagerMoveCAPIErrorMove(t *testing.T) {
     from := &types.Cluster{
         Name: "from-cluster",
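The error mocked in firstTry above is exactly the text the new kubectlResourceNotFoundRegex is written for, and its capture group picks out the missing resource type. A quick standalone check of that pattern (illustrative only, not part of the commit):

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern as kubectlResourceNotFoundRegex in cluster_manager.go.
	re := regexp.MustCompile(`.*the server doesn't have a resource type "(.*)".*`)

	msg := `executing wait: error: the server doesn't have a resource type "clusters"`
	if m := re.FindStringSubmatch(msg); m != nil {
		fmt.Println("retryable, missing resource type:", m[1]) // prints "clusters"
	}

	// A different failure is not matched, so the policy would not retry it.
	fmt.Println(re.MatchString("connection refused")) // false
}
```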
