From 768dbb868ec9b6c4dfad400e938d8f6566050691 Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:04:10 -0400 Subject: [PATCH] Fix flaky test RecoveryFromGatewayIT.testMultipleReplicaShardAssignmentWithDelayedAllocationAndDifferentNodeStartTimeInBatchMode (#14424) (#14432) (cherry picked from commit 802f2e6e4b21f27ddc6c01e7fc6f6cdcd69138d3) Signed-off-by: Swetha Guptha Signed-off-by: github-actions[bot] Co-authored-by: github-actions[bot] Co-authored-by: Swetha Guptha --- .../gateway/RecoveryFromGatewayIT.java | 44 ++++++++----------- 1 file changed, 19 insertions(+), 25 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java index fc0a574c191b1..6296608c64d37 100644 --- a/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/gateway/RecoveryFromGatewayIT.java @@ -34,7 +34,6 @@ import org.apache.lucene.index.CorruptIndexException; import org.opensearch.Version; -import org.opensearch.action.admin.cluster.allocation.ClusterAllocationExplainResponse; import org.opensearch.action.admin.cluster.configuration.AddVotingConfigExclusionsAction; import org.opensearch.action.admin.cluster.configuration.AddVotingConfigExclusionsRequest; import org.opensearch.action.admin.cluster.configuration.ClearVotingConfigExclusionsAction; @@ -101,6 +100,8 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutionException; +import java.util.concurrent.TimeUnit; +import java.util.function.BooleanSupplier; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -883,17 +884,20 @@ public void testMultipleReplicaShardAssignmentWithDelayedAllocationAndDifferentN assertEquals(YELLOW, health.getStatus()); assertEquals(2, health.getUnassignedShards()); // shard should be unassigned because of Allocation_Delayed - ClusterAllocationExplainResponse allocationExplainResponse = client().admin() - .cluster() - .prepareAllocationExplain() - .setIndex("test") - .setShard(0) - .setPrimary(false) - .get(); - assertEquals( - AllocationDecision.ALLOCATION_DELAYED, - allocationExplainResponse.getExplanation().getShardAllocationDecision().getAllocateDecision().getAllocationDecision() + BooleanSupplier delayedShardAllocationStatusVerificationSupplier = () -> AllocationDecision.ALLOCATION_DELAYED.equals( + client().admin() + .cluster() + .prepareAllocationExplain() + .setIndex("test") + .setShard(0) + .setPrimary(false) + .get() + .getExplanation() + .getShardAllocationDecision() + .getAllocateDecision() + .getAllocationDecision() ); + waitUntil(delayedShardAllocationStatusVerificationSupplier, 2, TimeUnit.MINUTES); logger.info("--> restarting the node 1"); internalCluster().startDataOnlyNode( @@ -903,26 +907,16 @@ public void testMultipleReplicaShardAssignmentWithDelayedAllocationAndDifferentN assertTrue(clusterRerouteResponse.isAcknowledged()); ensureStableCluster(6); waitUntil( - () -> client().admin().cluster().health(Requests.clusterHealthRequest().timeout("5m")).actionGet().getInitializingShards() == 0 + () -> client().admin().cluster().health(Requests.clusterHealthRequest().timeout("5m")).actionGet().getActiveShards() == 3, + 2, + TimeUnit.MINUTES ); - health = client().admin().cluster().health(Requests.clusterHealthRequest().timeout("5m")).actionGet(); assertFalse(health.isTimedOut()); assertEquals(YELLOW, health.getStatus()); assertEquals(1, health.getUnassignedShards()); assertEquals(1, health.getDelayedUnassignedShards()); - allocationExplainResponse = client().admin() - .cluster() - .prepareAllocationExplain() - .setIndex("test") - .setShard(0) - .setPrimary(false) - .get(); - assertEquals( - AllocationDecision.ALLOCATION_DELAYED, - allocationExplainResponse.getExplanation().getShardAllocationDecision().getAllocateDecision().getAllocationDecision() - ); - + waitUntil(delayedShardAllocationStatusVerificationSupplier, 2, TimeUnit.MINUTES); logger.info("--> restarting the node 0"); internalCluster().startDataOnlyNode( Settings.builder().put("node.name", nodesWithReplicaShards.get(1)).put(replicaNode1DataPathSettings).build()