From f12e63e3a7169c98f36d7a2d36c38fd803dd702a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 11 Apr 2024 10:12:52 -0700 Subject: [PATCH 01/11] Bump tar from 6.1.11 to 6.2.1 in /helix-front (#2789) Bumps [tar](https://github.com/isaacs/node-tar) from 6.1.11 to 6.2.1. - [Release notes](https://github.com/isaacs/node-tar/releases) - [Changelog](https://github.com/isaacs/node-tar/blob/main/CHANGELOG.md) - [Commits](https://github.com/isaacs/node-tar/compare/v6.1.11...v6.2.1) --- updated-dependencies: - dependency-name: tar dependency-type: indirect ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- helix-front/yarn.lock | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/helix-front/yarn.lock b/helix-front/yarn.lock index 57efb41424..4f31486519 100644 --- a/helix-front/yarn.lock +++ b/helix-front/yarn.lock @@ -11567,6 +11567,11 @@ minipass@^3.0.0, minipass@^3.1.0, minipass@^3.1.1, minipass@^3.1.3, minipass@^3. dependencies: yallist "^4.0.0" +minipass@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/minipass/-/minipass-5.0.0.tgz#3e9788ffb90b694a5d0ec94479a45b5d8738133d" + integrity sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ== + minizlib@^2.0.0, minizlib@^2.1.1, minizlib@^2.1.2: version "2.1.2" resolved "https://registry.yarnpkg.com/minizlib/-/minizlib-2.1.2.tgz#e90d3466ba209b932451508a11ce3d3632145931" @@ -14895,13 +14900,13 @@ tapable@^2.0.0, tapable@^2.1.1, tapable@^2.2.0: integrity sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ== tar@^6.0.2, tar@^6.1.0, tar@^6.1.11, tar@^6.1.2: - version "6.1.11" - resolved "https://registry.yarnpkg.com/tar/-/tar-6.1.11.tgz#6760a38f003afa1b2ffd0ffe9e9abbd0eab3d621" - integrity sha512-an/KZQzQUkZCkuoAA64hM92X0Urb6VpRhAFllDzz44U2mcD5scmT3zBc4VgVpkugF580+DQn8eAFSyoQt0tznA== + version "6.2.1" + resolved "https://registry.yarnpkg.com/tar/-/tar-6.2.1.tgz#717549c541bc3c2af15751bea94b1dd068d4b03a" + integrity sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A== dependencies: chownr "^2.0.0" fs-minipass "^2.0.0" - minipass "^3.0.0" + minipass "^5.0.0" minizlib "^2.1.1" mkdirp "^1.0.3" yallist "^4.0.0" From 9e25ab5772e523406196a5c6da531ac557f4d1ef Mon Sep 17 00:00:00 2001 From: Zachary Pinto Date: Thu, 25 Apr 2024 15:47:03 -0700 Subject: [PATCH 02/11] Replace the HELIX_ENABLED config with InstanceOperation while maintaining backwards compatibility with old APIs (#2772) Replace the HELIX_ENABLED config with InstanceOperation while maintaining backwards compatibility with old APIs. 
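As an illustrative sketch only (not part of this patch), the admin-side call pattern after this change could look like the following, assuming a ZK-based HelixAdmin and hypothetical cluster/instance names; the deprecated enableInstance call in the comment is the pre-patch equivalent.

    import org.apache.helix.HelixAdmin;
    import org.apache.helix.constants.InstanceConstants;
    import org.apache.helix.manager.zk.ZKHelixAdmin;

    public class InstanceOperationUsageSketch {
      public static void main(String[] args) {
        // Hypothetical names, for illustration only.
        String clusterName = "MyCluster";
        String instanceName = "localhost_12918";
        HelixAdmin admin = new ZKHelixAdmin("localhost:2181");

        // Old (now deprecated): admin.enableInstance(clusterName, instanceName, false);
        // New: express the same intent through an InstanceOperation.
        admin.setInstanceOperation(clusterName, instanceName,
            InstanceConstants.InstanceOperation.DISABLE);

        // Re-enable; per the updated HelixAdmin javadoc, passing null is treated as ENABLE.
        admin.setInstanceOperation(clusterName, instanceName,
            InstanceConstants.InstanceOperation.ENABLE);

        admin.close();
      }
    }
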
In order to unify HELIX_ENABLED functionality with InstanceOperation, InstanceOperation will now have the following options: ENABLE, DISABLE, EVACUATE, SWAP_IN, UNKNOWN --- .../helix/constants/InstanceConstants.java | 57 +- .../java/org/apache/helix/HelixAdmin.java | 8 +- .../BaseControllerDataProvider.java | 180 ++---- .../rebalancer/AbstractRebalancer.java | 9 +- .../rebalancer/DelayedAutoRebalancer.java | 12 +- .../rebalancer/topology/Topology.java | 2 +- .../rebalancer/util/DelayedRebalanceUtil.java | 9 +- .../rebalancer/waged/WagedRebalancer.java | 8 +- .../waged/model/ClusterModelProvider.java | 2 +- .../controller/stages/AttributeName.java | 1 + .../stages/BestPossibleStateCalcStage.java | 205 ++++--- .../stages/CurrentStateComputationStage.java | 25 +- .../stages/IntermediateStateCalcStage.java | 4 +- .../stages/MaintenanceRecoveryStage.java | 3 +- .../stages/MessageGenerationPhase.java | 12 + .../stages/ReadClusterDataStage.java | 2 +- .../examples/IdealStateBuilderExample.java | 3 +- .../helix/examples/IdealStateExample.java | 3 +- .../org/apache/helix/examples/Quickstart.java | 5 +- .../apache/helix/manager/zk/ZKHelixAdmin.java | 455 +++++++------- .../org/apache/helix/model/ClusterConfig.java | 9 + .../apache/helix/model/InstanceConfig.java | 158 ++++- .../apache/helix/model/MaintenanceSignal.java | 2 + .../helix/spectator/RoutingDataCache.java | 10 +- .../org/apache/helix/task/JobDispatcher.java | 4 +- .../org/apache/helix/tools/ClusterSetup.java | 2 +- .../StrictMatchExternalViewVerifier.java | 4 +- .../java/org/apache/helix/util/HelixUtil.java | 2 +- .../helix/util/InstanceValidationUtil.java | 12 +- .../helix/util/WeightAwareRebalanceUtil.java | 3 +- .../org/apache/helix/common/ZkTestBase.java | 4 +- .../TestResourceChangeDetector.java | 5 +- .../rebalancer/TestAbstractRebalancer.java | 2 + .../rebalancer/waged/TestWagedRebalancer.java | 16 +- .../waged/TestWagedRebalancerMetrics.java | 4 +- .../waged/model/AbstractTestClusterModel.java | 3 +- .../waged/model/TestClusterModelProvider.java | 10 +- ...estBestPossibleCalcStageCompatibility.java | 2 + .../TestBestPossibleStateCalcStage.java | 2 + .../TestCancellationMessageGeneration.java | 2 + .../TestIntermediateStateCalcStage.java | 4 + .../TestManagementMessageGeneration.java | 1 + .../stages/TestManagementModeStage.java | 5 +- .../stages/TestReplicaLevelThrottling.java | 3 +- .../stages/TestStateTransitionPriority.java | 4 + .../TestAlertingRebalancerFailure.java | 4 +- .../TestDisableCustomCodeRunner.java | 3 +- .../TestNoThrottleDisabledPartitions.java | 5 +- .../TestClusterMaintenanceMode.java | 4 +- .../messaging/TestMessageThrottle2.java | 3 +- .../CrushRebalancers/TestNodeSwap.java | 3 +- .../PartitionMigration/TestExpandCluster.java | 1 + .../rebalancer/TestInstanceOperation.java | 572 ++++++++---------- .../TestWagedExpandCluster.java | 3 +- .../WagedRebalancer/TestWagedNodeSwap.java | 5 +- .../WagedRebalancer/TestWagedRebalance.java | 5 +- ...TestP2PMessagesAvoidDuplicatedMessage.java | 9 + .../TestP2PStateTransitionMessages.java | 9 + .../TestP2PWithStateCancellationMessage.java | 1 + .../org/apache/helix/mock/MockHelixAdmin.java | 6 +- .../helix/model/TestInstanceConfig.java | 45 +- .../mbeans/TestRebalancerMetrics.java | 3 + .../task/TestTargetedTaskStateChange.java | 4 +- .../helix/util/TestIdealStateAssignment.java | 4 +- .../util/TestInstanceValidationUtil.java | 16 +- ...ctRebalancer.ComputeBestPossibleState.json | 3 +- .../StoppableInstancesSelector.java | 2 +- .../ResourceAssignmentOptimizerAccessor.java 
| 7 +- .../helix/rest/server/AbstractTestClass.java | 3 +- .../rest/server/TestInstancesAccessor.java | 4 +- .../server/TestPartitionAssignmentAPI.java | 3 +- .../rest/server/TestPerInstanceAccessor.java | 20 +- ...stResourceAssignmentOptimizerAccessor.java | 9 +- .../helix/recipes/rabbitmq/Consumer.java | 3 +- .../apache/helix/filestore/SetupCluster.java | 3 +- .../apache/helix/taskexecution/Worker.java | 3 +- 76 files changed, 1118 insertions(+), 925 deletions(-) diff --git a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java index e2cc2de2d5..07eb4989d3 100644 --- a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java +++ b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java @@ -1,8 +1,32 @@ package org.apache.helix.constants; +import java.util.Set; + public class InstanceConstants { public static final String INSTANCE_NOT_DISABLED = "INSTANCE_NOT_DISABLED"; + /** + * The set contains the InstanceOperations that are allowed to be assigned replicas by the rebalancer. + */ + public static final Set ASSIGNABLE_INSTANCE_OPERATIONS = + Set.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE); + + + /** + * The set contains the InstanceOperations that are overridden when the deprecated HELIX_ENABLED + * field is set to false. This will maintain backwards compatibility with the deprecated field. + * TODO: Remove this when the deprecated HELIX_ENABLED is removed. + */ + public static final Set INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS = + Set.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE, InstanceOperation.EVACUATE); + + + /** + * The set of InstanceOperations that are not allowed to be populated in the RoutingTableProvider. + */ + public static final Set UNSERVABLE_INSTANCE_OPERATIONS = + Set.of(InstanceOperation.SWAP_IN, InstanceOperation.UNKNOWN); + public enum InstanceDisabledType { CLOUD_EVENT, USER_OPERATION, @@ -10,8 +34,35 @@ public enum InstanceDisabledType { } public enum InstanceOperation { - EVACUATE, // Node will be removed after a period of time - SWAP_IN, // New node joining for swap operation - SWAP_OUT // Existing Node to be removed for swap operation + /** + * Behavior: Replicas will be assigned to the node and will receive upward state transitions if + * for new assignments and downward state transitions if replicas are being moved elsewhere. + * Final State: The node will have replicas assigned to it and will be considered for future assignment. + */ + ENABLE, + /** + * Behavior: All replicas on the node will be set to OFFLINE. + * Final State: The node will have all replicas in the OFFLINE state and can't take new assignment. + */ + DISABLE, + /** + * Behavior: All replicas will be moved off the node, after a replacement has been bootstrapped + * in another node in the cluster. + * Final State: The node will not contain any replicas and will not be considered for *NEW* assignment. + */ + EVACUATE, + /** + * Behavior: Node will have all replicas on its corresponding(same logicalId) swap-out node bootstrapped + * (ERROR and OFFLINE replicas on swap-out node will not be bootstrapped) to the same states if the StateModelDef allows. + * This node will be excluded from the RoutingTableProvider. + * Final State: This node will be a mirror the swap-out node, will not be considered for assignment, and will not be populated + * in the RoutingTableProvider. 
+ */ + SWAP_IN, + /** + * Behavior: Node will have all of its replicas dropped immediately and will be removed from the RoutingTableProvider. + * Final State: Node will not hold replicas, be considered for assignment, or be populated in the RoutingTableProvider. + */ + UNKNOWN } } diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java index 021332be7a..d2e0c2681a 100644 --- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java @@ -281,9 +281,11 @@ void addResource(String clusterName, String resourceName, int numPartitions, Str * @param instanceName * @param enabled */ + @Deprecated void enableInstance(String clusterName, String instanceName, boolean enabled); /** + * @deprecated use {@link #setInstanceOperation(String, String, InstanceConstants.InstanceOperation)} * @param clusterName * @param instanceName * @param enabled @@ -292,20 +294,24 @@ void addResource(String clusterName, String resourceName, int numPartitions, Str * @param reason set additional string description on why the instance is disabled when * enabled is false. Existing disabled reason will be over write if instance is in disabled state. */ + @Deprecated void enableInstance(String clusterName, String instanceName, boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason); /** * Batch enable/disable instances in a cluster * By default, all the instances are enabled + * @deprecated use {@link #setInstanceOperation(String, String, InstanceConstants.InstanceOperation)} * @param clusterName * @param instances * @param enabled */ + @Deprecated void enableInstance(String clusterName, List instances, boolean enabled); /** - * Set the instanceOperation field. + * Set the instanceOperation field. Setting it to null is equivalent to + * ENABLE. * * @param clusterName The cluster name * @param instanceName The instance name diff --git a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java index 1e40bbb720..a91ae12d27 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java +++ b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java @@ -64,7 +64,6 @@ import org.apache.helix.model.StateModelDefinition; import org.apache.helix.task.TaskConstants; import org.apache.helix.util.HelixUtil; -import org.apache.helix.util.InstanceValidationUtil; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.zookeeper.zkclient.DataUpdater; import org.slf4j.Logger; @@ -120,30 +119,40 @@ public class BaseControllerDataProvider implements ControlContextProvider { private final Set _disabledInstanceSet = new HashSet<>(); private static final class DerivedInstanceCache { - // Assignable instances are instances will contain at most one instance with a given logicalId. - // This is used for SWAP related operations where there can be two instances with the same logicalId. 
+ private final Map> + _instanceConfigMapByInstanceOperation; private final Map _assignableInstanceConfigMap; private final Map _assignableLiveInstancesMap; private final Map _swapOutInstanceNameToSwapInInstanceName; + private final Map _swapInInstanceNameToSwapOutInstanceName; private final Set _liveSwapInInstanceNames; - private final Set _enabledSwapInInstanceNames; - DerivedInstanceCache(Map assignableInstanceConfigMap, + DerivedInstanceCache( + Map> instanceConfigMapByInstanceOperation, + Map assignableInstanceConfigMap, Map assignableLiveInstancesMap, Map swapOutInstanceNameToSwapInInstanceName, - Set liveSwapInInstanceNames, Set enabledSwapInInstanceNames) { + Set liveSwapInInstanceNames) { + _instanceConfigMapByInstanceOperation = instanceConfigMapByInstanceOperation; _assignableInstanceConfigMap = assignableInstanceConfigMap; _assignableLiveInstancesMap = assignableLiveInstancesMap; _swapOutInstanceNameToSwapInInstanceName = swapOutInstanceNameToSwapInInstanceName; + _swapInInstanceNameToSwapOutInstanceName = swapOutInstanceNameToSwapInInstanceName.entrySet() + .stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); _liveSwapInInstanceNames = liveSwapInInstanceNames; - _enabledSwapInInstanceNames = enabledSwapInInstanceNames; + } + + private Map getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation instanceOperation) { + return _instanceConfigMapByInstanceOperation.getOrDefault(instanceOperation, + Collections.emptyMap()); } } // All maps and sets are encapsulated in DerivedInstanceCache to ensure that they are updated together // as a snapshot. private DerivedInstanceCache _derivedInstanceCache = - new DerivedInstanceCache(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashSet<>(), + new DerivedInstanceCache(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashSet<>()); private final Map _abnormalStateResolverMap = new HashMap<>(); private final Set _timedOutInstanceDuringMaintenance = new HashSet<>(); @@ -383,15 +392,14 @@ private void updateInstanceSets(Map instanceConfigMap, ClusterTopologyConfig.createFromClusterConfig(clusterConfig); // Create new caches to be populated. + Map> + newInstanceConfigMapByInstanceOperation = new HashMap<>(); Map newAssignableInstanceConfigMap = new HashMap<>(); Map newAssignableLiveInstancesMap = new HashMap<>(); - Map newSwapOutInstanceNameToSwapInInstanceName = new HashMap<>(); + Map newSwapOutInstanceNameToSwapOutInstanceName = new HashMap<>(); Set newLiveSwapInInstanceNames = new HashSet<>(); - Set newEnabledSwapInInstanceNames = new HashSet<>(); - - Map filteredInstancesByLogicalId = new HashMap<>(); - Map swapOutLogicalIdsByInstanceName = new HashMap<>(); - Map swapInInstancesByLogicalId = new HashMap<>(); + Map swapInLogicalIdsByInstanceName = new HashMap<>(); + Map nonSwapInInstancesByLogicalId = new HashMap<>(); for (Map.Entry entry : instanceConfigMap.entrySet()) { String node = entry.getKey(); @@ -404,44 +412,20 @@ private void updateInstanceSets(Map instanceConfigMap, String currentInstanceLogicalId = currentInstanceConfig.getLogicalId(clusterTopologyConfig.getEndNodeType()); - // Filter out instances with duplicate logical IDs. If there are duplicates, the instance with - // InstanceOperation SWAP_OUT will be chosen over the instance with SWAP_IN. SWAP_IN is not - // assignable. If there are duplicates with one node having no InstanceOperation and the other - // having SWAP_OUT, the node with no InstanceOperation will be chosen. 
This signifies SWAP - // completion, therefore making the node assignable. - if (filteredInstancesByLogicalId.containsKey(currentInstanceLogicalId)) { - String filteredNode = filteredInstancesByLogicalId.get(currentInstanceLogicalId); - InstanceConfig filteredDuplicateInstanceConfig = instanceConfigMap.get(filteredNode); - - if ((filteredDuplicateInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) - || currentInstanceConfig.getInstanceOperation().isEmpty()) { - // If the already filtered instance is SWAP_IN and this instance is in SWAP_OUT, then replace the filtered - // instance with this instance. If this instance has no InstanceOperation, then replace the filtered instance - // with this instance. This is the case where the SWAP_IN node has been marked as complete or SWAP_IN exists and - // SWAP_OUT does not. There can never be a case where both have no InstanceOperation set. - newAssignableInstanceConfigMap.remove(filteredNode); - newAssignableInstanceConfigMap.put(node, currentInstanceConfig); - filteredInstancesByLogicalId.put(currentInstanceLogicalId, node); - } - } else if (!currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) { - // EVACUATE instances are not considered to be assignable. + newInstanceConfigMapByInstanceOperation.computeIfAbsent( + currentInstanceConfig.getInstanceOperation(), k -> new HashMap<>()) + .put(node, currentInstanceConfig); + + if (currentInstanceConfig.isAssignable()) { newAssignableInstanceConfigMap.put(node, currentInstanceConfig); - filteredInstancesByLogicalId.put(currentInstanceLogicalId, node); } if (currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - swapOutLogicalIdsByInstanceName.put(currentInstanceConfig.getInstanceName(), + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { + swapInLogicalIdsByInstanceName.put(currentInstanceConfig.getInstanceName(), currentInstanceLogicalId); - } - - if (currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - swapInInstancesByLogicalId.put( + } else { + nonSwapInInstancesByLogicalId.put( currentInstanceConfig.getLogicalId(clusterTopologyConfig.getEndNodeType()), currentInstanceConfig.getInstanceName()); } @@ -453,25 +437,20 @@ private void updateInstanceSets(Map instanceConfigMap, } }); - swapOutLogicalIdsByInstanceName.forEach((swapOutInstanceName, value) -> { - String swapInInstanceName = swapInInstancesByLogicalId.get(value); - if (swapInInstanceName != null) { - newSwapOutInstanceNameToSwapInInstanceName.put(swapOutInstanceName, swapInInstanceName); + swapInLogicalIdsByInstanceName.forEach((swapInInstanceName, swapInLogicalId) -> { + String swapOutInstanceName = nonSwapInInstancesByLogicalId.get(swapInLogicalId); + if (swapOutInstanceName != null) { + newSwapOutInstanceNameToSwapOutInstanceName.put(swapOutInstanceName, swapInInstanceName); if (liveInstancesMap.containsKey(swapInInstanceName)) { newLiveSwapInInstanceNames.add(swapInInstanceName); } - if (InstanceValidationUtil.isInstanceEnabled(instanceConfigMap.get(swapInInstanceName), - clusterConfig)) { - newEnabledSwapInInstanceNames.add(swapInInstanceName); - } } }); // Replace caches with up-to-date instance sets. 
- _derivedInstanceCache = - new DerivedInstanceCache(newAssignableInstanceConfigMap, newAssignableLiveInstancesMap, - newSwapOutInstanceNameToSwapInInstanceName, newLiveSwapInInstanceNames, - newEnabledSwapInInstanceNames); + _derivedInstanceCache = new DerivedInstanceCache(newInstanceConfigMapByInstanceOperation, + newAssignableInstanceConfigMap, newAssignableLiveInstancesMap, + newSwapOutInstanceNameToSwapOutInstanceName, newLiveSwapInInstanceNames); } private void refreshResourceConfig(final HelixDataAccessor accessor, @@ -722,78 +701,50 @@ public Set getAllInstances() { } /** - * Return all the live nodes that are enabled and assignable + * Return a set of all instances that have the UNKNOWN InstanceOperation. + * These instances are not assignable and should have all replicas dropped + * immediately. * - * @return A new set contains live instance name and that are marked enabled + * @return A new set contains */ - public Set getAssignableEnabledLiveInstances() { - Set enabledLiveInstances = new HashSet<>(getAssignableLiveInstances().keySet()); - enabledLiveInstances.removeAll(getDisabledInstances()); - - return enabledLiveInstances; + public Set getUnknownInstances() { + return Collections.unmodifiableSet( + _derivedInstanceCache.getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation.UNKNOWN).keySet()); } /** - * Return all the live nodes that are enabled + * Return all the live nodes that are enabled. If a node is enabled, it is assignable. * @return A new set contains live instance name and that are marked enabled */ public Set getEnabledLiveInstances() { Set enabledLiveInstances = new HashSet<>(getLiveInstances().keySet()); - enabledLiveInstances.removeAll(getDisabledInstances()); + enabledLiveInstances.retainAll(getEnabledInstances()); return enabledLiveInstances; } /** - * Return all nodes that are enabled and assignable. - * - * @return A new set contains instance name and that are marked enabled - */ - public Set getAssignableEnabledInstances() { - Set enabledNodes = new HashSet<>(getAssignableInstances()); - enabledNodes.removeAll(getDisabledInstances()); - - return enabledNodes; - } - - /** - * Return all nodes that are enabled. + * Return all nodes that are enabled. If a node is enabled, it is assignable. * @return A new set contains instance name and that are marked enabled */ public Set getEnabledInstances() { - Set enabledNodes = new HashSet<>(getAllInstances()); - enabledNodes.removeAll(getDisabledInstances()); - - return enabledNodes; + return new HashSet<>(_derivedInstanceCache.getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation.ENABLE).keySet()); } /** - * Return all the live nodes that are enabled and assignable and tagged with given instanceTag. + * Return all the live nodes that are enabled and tagged with given instanceTag. If a node is + * enabled, it is assignable. * * @param instanceTag The instance group tag. * @return A new set contains live instance name and that are marked enabled and have the * specified tag. */ - public Set getAssignableEnabledLiveInstancesWithTag(String instanceTag) { - Set enabledLiveInstancesWithTag = new HashSet<>(getAssignableLiveInstances().keySet()); - Set instancesWithTag = getAssignableInstancesWithTag(instanceTag); - enabledLiveInstancesWithTag.retainAll(instancesWithTag); - enabledLiveInstancesWithTag.removeAll(getDisabledInstances()); - - return enabledLiveInstancesWithTag; - } - - /** - * Return all the live nodes that are enabled and tagged with given instanceTag. 
- * @param instanceTag The instance group tag. - * @return A new set contains live instance name and that are marked enabled and have the - * specified tag. - */ public Set getEnabledLiveInstancesWithTag(String instanceTag) { - Set enabledLiveInstancesWithTag = new HashSet<>(getLiveInstances().keySet()); + Set enabledLiveInstancesWithTag = new HashSet<>(getEnabledLiveInstances()); Set instancesWithTag = getAssignableInstancesWithTag(instanceTag); enabledLiveInstancesWithTag.retainAll(instancesWithTag); - enabledLiveInstancesWithTag.removeAll(getDisabledInstances()); return enabledLiveInstancesWithTag; } @@ -858,9 +809,9 @@ public Set getDisabledInstances() { } /** - * Get all swapping instance pairs. + * Get all swapping instance pairs keyed by swap-out instanceNames. * - * @return a map of SWAP_OUT instanceNames and their corresponding SWAP_IN instanceNames. + * @return a map of swap out instanceNames and their corresponding SWAP_IN instanceNames. */ public Map getSwapOutToSwapInInstancePairs() { return Collections.unmodifiableMap( @@ -868,21 +819,22 @@ public Map getSwapOutToSwapInInstancePairs() { } /** - * Get all the live SWAP_IN instances. + * Get all swapping instance pairs keyed by swap-in instanceNames. * - * @return a set of SWAP_IN instanceNames that have a corresponding SWAP_OUT instance. + * @return a map of swap in instanceNames and their corresponding swap out instanceNames. */ - public Set getLiveSwapInInstanceNames() { - return Collections.unmodifiableSet(_derivedInstanceCache._liveSwapInInstanceNames); + public Map getSwapInToSwapOutInstancePairs() { + return Collections.unmodifiableMap( + _derivedInstanceCache._swapInInstanceNameToSwapOutInstanceName); } /** - * Get all the enabled SWAP_IN instances. + * Get all the live SWAP_IN instances. * - * @return a set of SWAP_IN instanceNames that have a corresponding SWAP_OUT instance. + * @return a set of SWAP_IN instanceNames that have a corresponding swap out instance. */ - public Set getEnabledSwapInInstanceNames() { - return Collections.unmodifiableSet(_derivedInstanceCache._enabledSwapInInstanceNames); + public Set getLiveSwapInInstanceNames() { + return Collections.unmodifiableSet(_derivedInstanceCache._liveSwapInInstanceNames); } public synchronized void setLiveInstances(List liveInstances) { @@ -1127,7 +1079,7 @@ private void updateDisabledInstances(Collection allInstanceConfi _disabledInstanceSet.clear(); for (InstanceConfig config : allInstanceConfigs) { Map> disabledPartitionMap = config.getDisabledPartitionsMap(); - if (!InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (config.getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { _disabledInstanceSet.add(config.getInstanceName()); } for (String resource : disabledPartitionMap.keySet()) { diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java index 160e4eda6f..477ef2032c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java @@ -362,12 +362,19 @@ protected Map computeBestPossibleMap(List preferenceList } } + // TODO: Consider moving this logic to assignStatesToInstances since we are already passing + // disabledInstancesForPartition to that method. // (2) Set initial-state to certain instances that are disabled and in preference list. 
// Be careful with the conditions. for (String instance : preferenceList) { if (disabledInstancesForPartition.contains(instance)) { if (currentStateMap.containsKey(instance)) { - if (!currentStateMap.get(instance).equals(HelixDefinedState.ERROR.name())) { + if (currentStateMap.get(instance).equals(HelixDefinedState.ERROR.name())) { + // Must set to ERROR state here because assignStatesToInstances will not assign + // any state for disabledInstancesForPartition. This prevents the ERROR partition + // from being DROPPED. + bestPossibleStateMap.put(instance, HelixDefinedState.ERROR.name()); + } else { bestPossibleStateMap.put(instance, stateModelDef.getInitialState()); } } else { diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java index 252b63255a..d55a6eae83 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java @@ -95,7 +95,7 @@ public IdealState computeNewIdealState(String resourceName, String instanceTag = currentIdealState.getInstanceGroupTag(); if (instanceTag != null) { - assignableLiveEnabledNodes = clusterData.getAssignableEnabledLiveInstancesWithTag(instanceTag); + assignableLiveEnabledNodes = clusterData.getEnabledLiveInstancesWithTag(instanceTag); assignableNodes = clusterData.getAssignableInstancesWithTag(instanceTag); if (LOG.isInfoEnabled()) { @@ -105,7 +105,7 @@ public IdealState computeNewIdealState(String resourceName, currentIdealState.getInstanceGroupTag(), resourceName, assignableNodes, assignableLiveEnabledNodes)); } } else { - assignableLiveEnabledNodes = clusterData.getAssignableEnabledLiveInstances(); + assignableLiveEnabledNodes = clusterData.getEnabledLiveInstances(); assignableNodes = clusterData.getAssignableInstances(); } @@ -246,7 +246,7 @@ public ResourceAssignment computeBestPossiblePartitionState(ResourceControllerDa LOG.debug("Processing resource:" + resource.getResourceName()); } - Set allNodes = cache.getAssignableEnabledInstances(); + Set allNodes = cache.getEnabledInstances(); Set liveNodes = cache.getAssignableLiveInstances().keySet(); ClusterConfig clusterConfig = cache.getClusterConfig(); @@ -268,7 +268,8 @@ public ResourceAssignment computeBestPossiblePartitionState(ResourceControllerDa Map bestStateForPartition = // We use cache.getLiveInstances().keySet() to make sure we gracefully handle n -> n + 1 replicas if possible // when the one of the current nodes holding the replica is no longer considered assignable. (ex: EVACUATE) - computeBestPossibleStateForPartition(cache.getLiveInstances().keySet(), stateModelDef, preferenceList, + computeBestPossibleStateForPartition(cache.getLiveInstances().keySet(), + stateModelDef, preferenceList, currentStateOutput, disabledInstancesForPartition, idealState, clusterConfig, partition, cache.getAbnormalStateResolver(stateModelDefName), cache); @@ -328,6 +329,7 @@ protected Map computeBestPossibleStateForPartition(Set l while (it.hasNext()) { String instance = it.next(); String state = currentStateMap.get(instance); + // TODO: This may never be a possible case, figure out if we can safely remove this. 
if (state == null) { it.remove(); instancesToDrop.add(instance); // These instances should be set to DROPPED after we get bestPossibleStateMap; @@ -405,6 +407,8 @@ protected Map computeBestPossibleStateForPartition(Set l } } + // TODO: This may not be necessary, all of the instances bestPossibleStateMap should be set to ERROR + // if necessary in the call to computeBestPossibleMap. // Adding ERROR replica mapping to best possible // ERROR assignment should be mutual excluded from DROPPED assignment because // once there is an ERROR replica in the mapping, bestPossibleStateMap.size() > numReplicas prevents diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java index 335c30fdf2..2618275b13 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java @@ -205,7 +205,7 @@ private Node createClusterTree(ClusterConfig clusterConfig, boolean faultZoneLev } addEndNode(root, instanceName, instanceTopologyMap, weight, _liveInstances); } catch (IllegalArgumentException e) { - if (InstanceValidationUtil.isInstanceEnabled(insConfig, clusterConfig)) { + if (insConfig.getInstanceEnabled()) { throw e; } else { logger.warn("Topology setting {} for instance {} is unset or invalid, ignore the instance!", diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java index 2796064db0..90da408b1a 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java @@ -29,19 +29,16 @@ import java.util.stream.Collectors; import org.apache.helix.HelixManager; -import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica; import org.apache.helix.controller.rebalancer.waged.model.ClusterModelProvider; import org.apache.helix.model.ClusterConfig; -import org.apache.helix.model.ClusterTopologyConfig; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.Partition; import org.apache.helix.model.ResourceAssignment; import org.apache.helix.model.ResourceConfig; import org.apache.helix.util.InstanceValidationUtil; -import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -157,7 +154,7 @@ private static long getInactiveTime(String instance, Set liveInstances, } // check the time instance got disabled. 
- if (!InstanceValidationUtil.isInstanceEnabled(instanceConfig, clusterConfig)) { + if (!instanceConfig.getInstanceEnabled()) { long disabledTime = instanceConfig.getInstanceEnabledTime(); String batchedDisabledTime = clusterConfig.getInstanceHelixDisabledTimeStamp(instance); if (batchedDisabledTime != null && !batchedDisabledTime.isEmpty()) { @@ -409,7 +406,7 @@ private static List findPartitionsMissingMinActiveReplica( ResourceAssignment resourceAssignment) { String resourceName = resourceAssignment.getResourceName(); IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig .mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName), @@ -430,7 +427,7 @@ private static List findPartitionsMissingMinActiveReplica( private static int getMinActiveReplica(ResourceControllerDataProvider clusterData, String resourceName) { IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); return DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig .mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName), diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java index ae7e49a1d5..39a197bff5 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java @@ -304,7 +304,7 @@ private Map computeBestPossibleStates( Set activeNodes = DelayedRebalanceUtil.getActiveNodes(clusterData.getAssignableInstances(), - clusterData.getAssignableEnabledLiveInstances(), + clusterData.getEnabledLiveInstances(), clusterData.getInstanceOfflineTimeMap(), clusterData.getAssignableLiveInstances().keySet(), clusterData.getAssignableInstanceConfigMap(), clusterData.getClusterConfig()); @@ -401,7 +401,7 @@ private Map handleDelayedRebalanceMinActiveReplica( RebalanceAlgorithm algorithm) throws HelixRebalanceException { // the "real" live nodes at the time - final Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + final Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); if (activeNodes.equals(enabledLiveInstances) || !requireRebalanceOverwrite(clusterData, currentResourceAssignment)) { // no need for additional process, return the current resource assignment @@ -602,7 +602,7 @@ private void delayedRebalanceSchedule(ResourceControllerDataProvider clusterData ClusterConfig clusterConfig = clusterData.getClusterConfig(); boolean delayedRebalanceEnabled = DelayedRebalanceUtil.isDelayRebalanceEnabled(clusterConfig); Set offlineOrDisabledInstances = new HashSet<>(delayedActiveNodes); - offlineOrDisabledInstances.removeAll(clusterData.getAssignableEnabledLiveInstances()); + offlineOrDisabledInstances.removeAll(clusterData.getEnabledLiveInstances()); for (String resource : resourceSet) { DelayedRebalanceUtil 
.setRebalanceScheduler(resource, delayedRebalanceEnabled, offlineOrDisabledInstances, @@ -623,7 +623,7 @@ protected boolean requireRebalanceOverwrite(ResourceControllerDataProvider clust String resourceName = resourceAssignment.getResourceName(); IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java index ddd9880c0b..75151d3363 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java @@ -164,7 +164,7 @@ public static ClusterModel generateClusterModelFromExistingAssignment( ResourceControllerDataProvider dataProvider, Map resourceMap, Map currentStateAssignment) { return generateClusterModel(dataProvider, resourceMap, - dataProvider.getAssignableEnabledLiveInstances(), Collections.emptyMap(), + dataProvider.getEnabledLiveInstances(), Collections.emptyMap(), Collections.emptyMap(), currentStateAssignment, RebalanceScopeType.GLOBAL_BASELINE); } diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java b/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java index 8771bff64b..0db5252ee0 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java @@ -24,6 +24,7 @@ public enum AttributeName { RESOURCES_TO_REBALANCE, BEST_POSSIBLE_STATE, CURRENT_STATE, + CURRENT_STATE_EXCLUDING_UNKNOWN, CUSTOMIZED_STATE, INTERMEDIATE_STATE, MESSAGES_ALL, diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java index 2a6f9644e9..1db0eccfca 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java @@ -30,10 +30,13 @@ import java.util.concurrent.Callable; import java.util.stream.Collectors; +import org.apache.helix.HelixDefinedState; import org.apache.helix.HelixException; import org.apache.helix.HelixManager; import org.apache.helix.HelixRebalanceException; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.LogUtil; +import org.apache.helix.controller.common.ResourcesStateMap; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.pipeline.AbstractBaseStage; import org.apache.helix.controller.pipeline.StageException; @@ -74,7 +77,8 @@ public class BestPossibleStateCalcStage extends AbstractBaseStage { @Override public void process(ClusterEvent event) throws Exception { _eventId = event.getEventId(); - CurrentStateOutput currentStateOutput = event.getAttribute(AttributeName.CURRENT_STATE.name()); + CurrentStateOutput currentStateOutput = + 
event.getAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name()); final Map resourceMap = event.getAttribute(AttributeName.RESOURCES_TO_REBALANCE.name()); final ClusterStatusMonitor clusterStatusMonitor = @@ -83,8 +87,8 @@ public void process(ClusterEvent event) throws Exception { event.getAttribute(AttributeName.ControllerDataProvider.name()); if (currentStateOutput == null || resourceMap == null || cache == null) { - throw new StageException( - "Missing attributes in event:" + event + ". Requires CURRENT_STATE|RESOURCES|DataCache"); + throw new StageException("Missing attributes in event:" + event + + ". Requires CURRENT_STATE_EXCLUDING_UNKNOWN|RESOURCES|DataCache"); } final BestPossibleStateOutput bestPossibleStateOutput = @@ -131,82 +135,104 @@ public void process(ClusterEvent event) throws Exception { }); } + private String selectSwapInState(StateModelDefinition stateModelDef, Map stateMap, + String swapOutInstance) { + // If the swap-in node is live, select state with the following logic: + // 1. If the swap-out instance's replica is in the stateMap: + // - if the swap-out instance's replica is a topState, select the swap-in instance's replica to the topState. + // if another is allowed to be added, otherwise select the swap-in instance's replica to a secondTopState. + // - if the swap-out instance's replica is not a topState or ERROR, select the swap-in instance's replica to the same state. + // - if the swap-out instance's replica is ERROR, select the swap-in instance's replica to the initialState. + // 2. If the swap-out instance's replica is not in the stateMap, select the swap-in instance's replica to the initialState. + // This happens when the swap-out node is offline. + if (stateMap.containsKey(swapOutInstance)) { + if (stateMap.get(swapOutInstance).equals(stateModelDef.getTopState()) || stateMap.get( + swapOutInstance).equals(HelixDefinedState.ERROR.name())) { + // If the swap-out instance's replica is a topState, select the swap-in instance's replica + // to be the topState if the StateModel allows another to be added. If not, select the swap-in + // to be the secondTopState. + String topStateCount = stateModelDef.getNumInstancesPerState(stateModelDef.getTopState()); + if (topStateCount.equals(StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES) + || topStateCount.equals(StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS)) { + // If the StateModel allows for another replica with the topState to be added, + // select the swap-in instance's replica to the topState. + return stateModelDef.getTopState(); + } + // If StateModel does not allow another topState replica to be + // added, select the swap-in instance's replica to be the secondTopState. + return stateModelDef.getSecondTopStates().iterator().next(); + } + // If the swap-out instance's replica is not a topState or ERROR, select the swap-in instance's replica + // to be the same state + return stateMap.get(swapOutInstance); + } + // If the swap-out instance's replica is not in the stateMap, return null + return null; + } + private void addSwapInInstancesToBestPossibleState(Map resourceMap, BestPossibleStateOutput bestPossibleStateOutput, ResourceControllerDataProvider cache) { - // 1. Get all SWAP_OUT instances and corresponding SWAP_IN instance pairs in the cluster. + // 1. Get all swap out instances and corresponding SWAP_IN instance pairs in the cluster. Map swapOutToSwapInInstancePairs = cache.getSwapOutToSwapInInstancePairs(); - // 2. Get all enabled and live SWAP_IN instances in the cluster. 
+ Map swapInToSwapOutInstancePairs = cache.getSwapInToSwapOutInstancePairs(); + + // 2. Get all live SWAP_IN instances in the cluster. Set liveSwapInInstances = cache.getLiveSwapInInstanceNames(); - Set enabledSwapInInstances = cache.getEnabledSwapInInstanceNames(); - // 3. For each SWAP_OUT instance in any of the preferenceLists, add the corresponding SWAP_IN instance to - // the stateMap with the correct state. - // Skipping this when there are no SWAP_IN instances that are alive will reduce computation time. - if (!liveSwapInInstances.isEmpty() && !cache.isMaintenanceModeEnabled()) { - resourceMap.forEach((resourceName, resource) -> { - StateModelDefinition stateModelDef = cache.getStateModelDef(resource.getStateModelDefRef()); - bestPossibleStateOutput.getResourceStatesMap().get(resourceName).getStateMap() - .forEach((partition, stateMap) -> { - // We use the preferenceList for the case where the swapOutInstance goes offline. - // We do not want to drop the replicas that may have been bootstrapped on the swapInInstance - // in the case that the swapOutInstance goes offline and no longer has an entry in the stateMap. - Set commonInstances = new HashSet<>( - bestPossibleStateOutput.getPreferenceList(resourceName, - partition.getPartitionName())); - commonInstances.retainAll(swapOutToSwapInInstancePairs.keySet()); - - commonInstances.forEach(swapOutInstance -> { - // If the corresponding swap-in instance is not live, skip assigning to it. - if (!liveSwapInInstances.contains( - swapOutToSwapInInstancePairs.get(swapOutInstance))) { - return; - } - - // If the corresponding swap-in instance is not enabled, assign replicas with - // initial state. - if (!enabledSwapInInstances.contains( - swapOutToSwapInInstancePairs.get(swapOutInstance))) { - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getInitialState()); - return; - } - - // If the swap-in node is live and enabled, do assignment with the following logic: - // 1. If the swap-out instance's replica is a secondTopState, set the swap-in instance's replica - // to the same secondTopState. - // 2. If the swap-out instance's replica is any other state and is in the preferenceList, - // set the swap-in instance's replica to the topState if the StateModel allows another to be added. - // If not, set the swap-in instance's replica to the secondTopState. - // We can make this assumption because if there is assignment to the swapOutInstance, it must be either - // a topState or a secondTopState. - if (stateMap.containsKey(swapOutInstance) && stateModelDef.getSecondTopStates() - .contains(stateMap.get(swapOutInstance))) { - // If the swap-out instance's replica is a secondTopState, set the swap-in instance's replica - // to the same secondTopState. - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateMap.get(swapOutInstance)); - } else { - // If the swap-out instance's replica is any other state in the stateMap or not present in the - // stateMap, set the swap-in instance's replica to the topState if the StateModel allows another - // to be added. If not, set the swap-in to the secondTopState. - String topStateCount = - stateModelDef.getNumInstancesPerState(stateModelDef.getTopState()); - if (topStateCount.equals( - StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES) - || topStateCount.equals( - StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS)) { - // If the StateModel allows for another replica with the topState to be added, - // set the swap-in instance's replica to the topState. 
- stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getTopState()); - } else { - // If StateModel does not allow another topState replica to be - // added, set the swap-in instance's replica to the secondTopState. - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getSecondTopStates().iterator().next()); - } - } - }); + if (liveSwapInInstances.isEmpty() || cache.isMaintenanceModeEnabled()) { + return; + } + + // 3. Find the assignment for each swap-in instance + // : : + Map>> swapInInstanceAssignment = new HashMap<>(); + resourceMap.forEach((resourceName, resource) -> { + bestPossibleStateOutput.getResourceStatesMap().get(resourceName).getStateMap() + .forEach((partition, stateMap) -> { + // We use the preferenceList for the case where the swapOutInstance goes offline. + // We do not want to drop the replicas that may have been bootstrapped on the swapInInstance + // in the case that the swapOutInstance goes offline and no longer has an entry in the stateMap. + Set commonInstances = + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition) != null + ? new HashSet<>( + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition).keySet()) + : Collections.emptySet(); + if (commonInstances.isEmpty()) { + return; + } + commonInstances.retainAll(swapOutToSwapInInstancePairs.keySet()); + + commonInstances.forEach(swapOutInstance -> { + swapInInstanceAssignment.computeIfAbsent( + swapOutToSwapInInstancePairs.get(swapOutInstance), k -> new HashMap<>()) + .computeIfAbsent(resourceName, k -> new HashSet<>()) + .add(partition.getPartitionName()); }); + }); + }); + + // 4. Add the correct states for the swap-in instances to the bestPossibleStateOutput. + if (!swapInInstanceAssignment.isEmpty()) { + swapInInstanceAssignment.forEach((swapInInstance, resourceMapForInstance) -> { + // If the corresponding swap-in instance is not live, skip assigning to it. + if (!liveSwapInInstances.contains(swapInInstance)) { + return; + } + + resourceMapForInstance.forEach((resourceName, partitions) -> { + partitions.forEach(partitionName -> { + Partition partition = new Partition(partitionName); + Map stateMap = + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition); + + String selectedState = selectSwapInState( + cache.getStateModelDef(resourceMap.get(resourceName).getStateModelDefRef()), + stateMap, swapInToSwapOutInstancePairs.get(swapInInstance)); + if (stateMap != null) { + bestPossibleStateOutput.setState(resourceName, partition, swapInInstance, + selectedState); + } + }); + }); }); } } @@ -250,7 +276,7 @@ private BestPossibleStateOutput compute(ClusterEvent event, Map failureResources = new ArrayList<>(); @@ -323,25 +349,32 @@ public Object call() { }); } - // Check whether the offline/disabled instance count in the cluster reaches the set limit, + // Check whether the offline/unable to accept online replicas instance count in the cluster reaches the set limit, // if yes, auto enable maintenance mode, and use the maintenance rebalancer for this pipeline. 
- private boolean validateOfflineInstancesLimit(final ResourceControllerDataProvider cache, + private boolean validateInstancesUnableToAcceptOnlineReplicasLimit(final ResourceControllerDataProvider cache, final HelixManager manager) { - int maxOfflineInstancesAllowed = cache.getClusterConfig().getMaxOfflineInstancesAllowed(); - if (maxOfflineInstancesAllowed >= 0) { - int offlineCount = - cache.getAssignableInstances().size() - cache.getAssignableEnabledLiveInstances().size(); - if (offlineCount > maxOfflineInstancesAllowed) { + int maxInstancesUnableToAcceptOnlineReplicas = + cache.getClusterConfig().getMaxOfflineInstancesAllowed(); + if (maxInstancesUnableToAcceptOnlineReplicas >= 0) { + // Instead of only checking the offline instances, we consider how many instances in the cluster + // are not assignable and live. This is because some instances may be online but have an unassignable + // InstanceOperation such as EVACUATE, DISABLE, or UNKNOWN. We will exclude SWAP_IN instances from + // they should not account against the capacity of the cluster. + int instancesUnableToAcceptOnlineReplicas = cache.getInstanceConfigMap().entrySet().stream() + .filter(instanceEntry -> !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( + instanceEntry.getValue().getInstanceOperation())).collect(Collectors.toSet()) + .size() - cache.getEnabledLiveInstances().size(); + if (instancesUnableToAcceptOnlineReplicas > maxInstancesUnableToAcceptOnlineReplicas) { String errMsg = String.format( - "Offline Instances count %d greater than allowed count %d. Put cluster %s into " - + "maintenance mode.", - offlineCount, maxOfflineInstancesAllowed, cache.getClusterName()); + "Instances unable to take ONLINE replicas count %d greater than allowed count %d. Put cluster %s into " + + "maintenance mode.", instancesUnableToAcceptOnlineReplicas, + maxInstancesUnableToAcceptOnlineReplicas, cache.getClusterName()); if (manager != null) { if (manager.getHelixDataAccessor() .getProperty(manager.getHelixDataAccessor().keyBuilder().maintenance()) == null) { manager.getClusterManagmentTool() .autoEnableMaintenanceMode(manager.getClusterName(), true, errMsg, - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); LogUtil.logWarn(logger, _eventId, errMsg); } } else { diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java index 6fbb2b63e5..3bf23d22ef 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java @@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService; import java.util.stream.Collectors; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.LogUtil; import org.apache.helix.controller.dataproviders.BaseControllerDataProvider; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; @@ -43,6 +44,7 @@ import org.apache.helix.controller.rebalancer.waged.model.ClusterModelProvider; import org.apache.helix.model.CurrentState; import org.apache.helix.model.IdealState; +import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.LiveInstance; import org.apache.helix.model.Message; import org.apache.helix.model.Message.MessageType; @@ -86,24 +88,39 @@ 
public void process(ClusterEvent event) throws Exception { Map liveInstances = cache.getLiveInstances(); final CurrentStateOutput currentStateOutput = new CurrentStateOutput(); + final CurrentStateOutput currentStateExcludingUnknown = new CurrentStateOutput(); for (LiveInstance instance : liveInstances.values()) { String instanceName = instance.getInstanceName(); String instanceSessionId = instance.getEphemeralOwner(); + InstanceConfig instanceConfig = cache.getInstanceConfigMap().get(instanceName); + + Set existingStaleMessages = cache.getStaleMessagesByInstance(instanceName); + Map messages = cache.getMessages(instanceName); + Map relayMessages = cache.getRelayMessages(instanceName); // update current states. updateCurrentStates(instance, cache.getCurrentState(instanceName, instanceSessionId, _isTaskFrameworkPipeline).values(), currentStateOutput, resourceMap); - - Set existingStaleMessages = cache.getStaleMessagesByInstance(instanceName); // update pending messages - Map messages = cache.getMessages(instanceName); - Map relayMessages = cache.getRelayMessages(instanceName); updatePendingMessages(instance, cache, messages.values(), relayMessages.values(), existingStaleMessages, currentStateOutput, resourceMap); + + // Only update the currentStateExcludingUnknown if the instance is not in UNKNOWN InstanceOperation. + if (instanceConfig == null || !instanceConfig.getInstanceOperation() + .equals(InstanceConstants.InstanceOperation.UNKNOWN)) { + // update current states. + updateCurrentStates(instance, + cache.getCurrentState(instanceName, instanceSessionId, _isTaskFrameworkPipeline) + .values(), currentStateExcludingUnknown, resourceMap); + // update pending messages + updatePendingMessages(instance, cache, messages.values(), relayMessages.values(), + existingStaleMessages, currentStateExcludingUnknown, resourceMap); + } } event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateExcludingUnknown); final ClusterStatusMonitor clusterStatusMonitor = event.getAttribute(AttributeName.clusterStatusMonitor.name()); diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java index b3990046c0..ba2e16018f 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java @@ -661,11 +661,11 @@ private Map getRequiredStates(String resourceName, // preference list if (preferenceList != null) { return stateModelDefinition.getStateCountMap((int) preferenceList.stream().filter( - i -> resourceControllerDataProvider.getAssignableEnabledLiveInstances().contains(i)) + i -> resourceControllerDataProvider.getEnabledLiveInstances().contains(i)) .count(), requiredNumReplica); // StateModelDefinition's counts } return stateModelDefinition.getStateCountMap( - resourceControllerDataProvider.getAssignableEnabledLiveInstances().size(), + resourceControllerDataProvider.getEnabledLiveInstances().size(), requiredNumReplica); // StateModelDefinition's counts } diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java index d262d14023..1a5185a052 100644 --- 
a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java @@ -82,6 +82,7 @@ public void execute(final ClusterEvent event) throws Exception { String reason; switch (internalReason) { case MAX_OFFLINE_INSTANCES_EXCEEDED: + case MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS: // Check on the number of offline/disabled instances int numOfflineInstancesForAutoExit = cache.getClusterConfig().getNumOfflineInstancesForAutoExit(); @@ -90,7 +91,7 @@ public void execute(final ClusterEvent event) throws Exception { } // Get the count of all instances that are either offline or disabled int offlineDisabledCount = - cache.getAssignableInstances().size() - cache.getAssignableEnabledLiveInstances().size(); + cache.getAssignableInstances().size() - cache.getEnabledLiveInstances().size(); shouldExitMaintenance = offlineDisabledCount <= numOfflineInstancesForAutoExit; reason = String.format( "Auto-exiting maintenance mode for cluster %s; Num. of offline/disabled instances is %d, less than or equal to the exit threshold %d", diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java b/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java index 5c22c11dba..859c6679e9 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java @@ -135,6 +135,8 @@ private void generateMessage(final Resource resource, final BaseControllerDataPr new HashMap<>(resourcesStateMap.getInstanceStateMap(resourceName, partition)); Map pendingStateMap = currentStateOutput.getPendingStateMap(resourceName, partition); + Map currentStateMap = + currentStateOutput.getCurrentStateMap(resourceName, partition); // The operation is combining pending state with best possible state. Since some replicas have // been moved from one instance to another, the instance will exist in pending state but not @@ -146,6 +148,16 @@ private void generateMessage(final Resource resource, final BaseControllerDataPr } } + // Look through the current state map and add DROPPED message if the instance is not in the + // resourceStateMap. This instance may not have been dropped by the rebalance strategy. + // This check is required to ensure that the instances removed from the ideal state stateMap + // are properly dropped.
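The comment above introduces a loop (continued in the next hunk lines) that back-fills a DROPPED assignment for any instance that still reports a current state but no longer appears in the computed state map. A minimal standalone restatement over plain maps, assuming string state names as in HelixDefinedState:

import java.util.HashMap;
import java.util.Map;

// Illustrative only: plain-map restatement of the DROPPED back-fill. In the real stage the
// maps come from CurrentStateOutput and the computed best-possible state, and the constant
// is HelixDefinedState.DROPPED.
final class DroppedStateBackfill {
  private static final String DROPPED = "DROPPED";

  // For every instance that still holds a current state but is absent from the desired map,
  // request a drop so the stale replica is cleaned up.
  static Map<String, String> withDroppedForOrphans(Map<String, String> currentStateMap,
      Map<String, String> desiredStateMap) {
    Map<String, String> result = new HashMap<>(desiredStateMap);
    for (String instance : currentStateMap.keySet()) {
      result.putIfAbsent(instance, DROPPED);
    }
    return result;
  }
}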
+ for (String instance : currentStateMap.keySet()) { + if (!instanceStateMap.containsKey(instance)) { + instanceStateMap.put(instance, HelixDefinedState.DROPPED.name()); + } + } + // we should generate message based on the desired-state priority // so keep generated messages in a temp map keyed by state // desired-state->list of generated-messages diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java index 7e8bde9d1b..6a0ae76fc5 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java @@ -90,7 +90,7 @@ public void process(ClusterEvent event) throws Exception { instanceMessageMap.put(instanceName, Sets.newHashSet(dataProvider.getMessages(instanceName).values())); } - if (!InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (!config.getInstanceEnabled()) { disabledInstanceSet.add(instanceName); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java b/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java index fd91588359..036a548bbb 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java +++ b/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java @@ -19,6 +19,7 @@ * under the License. */ +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; @@ -73,7 +74,7 @@ public static void main(String[] args) { InstanceConfig config = new InstanceConfig("localhost_" + port); config.setHostName("localhost"); config.setPort(Integer.toString(port)); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java b/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java index 1f3939c7e4..fa5b7cd72a 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java +++ b/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java @@ -19,6 +19,7 @@ * under the License. 
*/ +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; @@ -113,7 +114,7 @@ public static void main(String[] args) throws Exception { InstanceConfig config = new InstanceConfig("localhost_" + port); config.setHostName("localhost"); config.setPort(Integer.toString(port)); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java b/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java index 9cc14b6039..37fd1ac150 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java +++ b/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java @@ -29,6 +29,7 @@ import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.model.ExternalView; @@ -69,7 +70,7 @@ public class Quickstart { InstanceConfig instanceConfig = new InstanceConfig("localhost_" + port); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + port); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); INSTANCE_CONFIG_LIST.add(instanceConfig); } @@ -190,7 +191,7 @@ private static void addNode() throws Exception { InstanceConfig instanceConfig = new InstanceConfig("localhost_" + port); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + port); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); echo("ADDING NEW NODE :" + instanceConfig.getInstanceName() + ". 
Partitions will move from old nodes to the new node."); admin.addInstance(CLUSTER_NAME, instanceConfig); diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index d52967f5c3..c7fe0861ba 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -61,7 +61,6 @@ import org.apache.helix.api.status.ClusterManagementModeRequest; import org.apache.helix.api.topology.ClusterTopology; import org.apache.helix.constants.InstanceConstants; -import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer; import org.apache.helix.controller.rebalancer.strategy.RebalanceStrategy; import org.apache.helix.controller.rebalancer.util.WagedValidationUtil; import org.apache.helix.controller.rebalancer.waged.WagedRebalancer; @@ -119,10 +118,10 @@ public class ZKHelixAdmin implements HelixAdmin { public static final String CONNECTION_TIMEOUT = "helixAdmin.timeOutInSec"; private static final String MAINTENANCE_ZNODE_ID = "maintenance"; private static final int DEFAULT_SUPERCLUSTER_REPLICA = 3; - private static final ImmutableSet ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE = - ImmutableSet.of("", InstanceConstants.InstanceOperation.SWAP_IN.name()); - private static final ImmutableSet INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT = - ImmutableSet.of(InstanceConstants.InstanceOperation.EVACUATE.name()); + private static final ImmutableSet + INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT = + ImmutableSet.of(InstanceConstants.InstanceOperation.EVACUATE, + InstanceConstants.InstanceOperation.UNKNOWN); private final RealmAwareZkClient _zkClient; private final ConfigAccessor _configAccessor; @@ -206,113 +205,29 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { throw new HelixException("Node " + nodeId + " already exists in cluster " + clusterName); } - if (!ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE.contains( - instanceConfig.getInstanceOperation())) { + List matchingLogicalIdInstances = + findInstancesMatchingLogicalId(clusterName, instanceConfig); + if (matchingLogicalIdInstances.size() > 1) { throw new HelixException( - "Instance can only be added if InstanceOperation is set to one of" + "the following: " - + ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE + " This instance: " + nodeId - + " has InstanceOperation set to " + instanceConfig.getInstanceOperation()); + "There are already more than one instance with the same logicalId in the cluster: " + + matchingLogicalIdInstances.stream().map(InstanceConfig::getInstanceName) + .collect(Collectors.joining(", ")) + + " Please make sure there is at most 2 instance with the same logicalId in the cluster."); } - // Get the topology key used to determine the logicalId of a node. 
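For reference, a hedged usage sketch of the new join path: at most two instances may share a logicalId, and (as the rest of this addInstance hunk shows below) an initial operation that is not valid for the current cluster state no longer rejects the join but registers the node with INSTANCE_OPERATION set to UNKNOWN. The ZooKeeper address, cluster name, and instance name are placeholders; only calls that appear elsewhere in this patch are used.

import org.apache.helix.HelixAdmin;
import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.model.InstanceConfig;

// Illustrative usage only.
public class AddSwapInInstanceExample {
  public static void main(String[] args) {
    HelixAdmin admin = new ZKHelixAdmin("localhost:2181");

    // A node intended to replace an existing one shares that node's logicalId; at most two
    // instances may share a logicalId at any time. In a topology-aware cluster the logicalId
    // comes from the configured topology/DOMAIN fields, which this sketch leaves out.
    InstanceConfig swapIn = new InstanceConfig("localhost_12001");
    swapIn.setHostName("localhost");
    swapIn.setPort("12001");
    swapIn.setInstanceOperation(InstanceConstants.InstanceOperation.SWAP_IN);

    // If SWAP_IN is not a legal initial operation for the current cluster state, the node is
    // still registered, but with INSTANCE_OPERATION set to UNKNOWN (see the validation and
    // fallback later in this addInstance hunk).
    admin.addInstance("MyCluster", swapIn);

    admin.close();
  }
}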
- ClusterConfig clusterConfig = _configAccessor.getClusterConfig(clusterName); - ClusterTopologyConfig clusterTopologyConfig = - ClusterTopologyConfig.createFromClusterConfig(clusterConfig); - String logicalIdKey = clusterTopologyConfig.getEndNodeType(); - String faultZoneKey = clusterTopologyConfig.getFaultZoneType(); - String toAddInstanceLogicalId = instanceConfig.getLogicalId(logicalIdKey); - - HelixConfigScope instanceConfigScope = - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build(); - List existingInstanceIds = getConfigKeys(instanceConfigScope); - List foundInstanceConfigsWithMatchingLogicalId = - existingInstanceIds.parallelStream() - .map(existingInstanceId -> getInstanceConfig(clusterName, existingInstanceId)).filter( - existingInstanceConfig -> existingInstanceConfig.getLogicalId(logicalIdKey) - .equals(toAddInstanceLogicalId)).collect(Collectors.toList()); - - if (foundInstanceConfigsWithMatchingLogicalId.size() >= 2) { - // If the length is 2, we cannot add an instance with the same logicalId as an existing instance - // regardless of InstanceOperation. - throw new HelixException( - "There can only be 2 instances with the same logicalId in a cluster. " - + "Existing instances: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getInstanceName() + " and " + foundInstanceConfigsWithMatchingLogicalId.get(1) - .getInstanceName() + " already have the same logicalId: " + toAddInstanceLogicalId - + "; therefore, " + nodeId + " cannot be added to the cluster."); - } else if (foundInstanceConfigsWithMatchingLogicalId.size() == 1) { - // If there is only one instance with the same logicalId, - // we can infer that the intended behaviour is to SWAP_IN or EVACUATE + ADD. - if (foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - // If the existing instance with the same logicalId has SWAP_OUT InstanceOperation - - // If the InstanceOperation is unset, we will set it to SWAP_IN. - if (!instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.SWAP_IN); - } - - // If the existing instance with the same logicalId is not in the same FAULT_ZONE as this instance, we cannot - // add this instance. - if (!foundInstanceConfigsWithMatchingLogicalId.get(0).getDomainAsMap() - .containsKey(faultZoneKey) || !instanceConfig.getDomainAsMap().containsKey(faultZoneKey) - || !foundInstanceConfigsWithMatchingLogicalId.get(0).getDomainAsMap().get(faultZoneKey) - .equals(instanceConfig.getDomainAsMap().get(faultZoneKey))) { - throw new HelixException( - "Instance can only be added if the SWAP_OUT instance sharing the same logicalId is in the same FAULT_ZONE" - + " as this instance. " + "Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has FAULT_ZONE_TYPE: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getDomainAsMap().get(faultZoneKey) + " and this instance: " + nodeId - + " has FAULT_ZONE_TYPE: " + instanceConfig.getDomainAsMap().get(faultZoneKey)); - } - - Map foundInstanceCapacityMap = - foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceCapacityMap().isEmpty() - ? clusterConfig.getDefaultInstanceCapacityMap() - : foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceCapacityMap(); - Map instanceCapacityMap = instanceConfig.getInstanceCapacityMap().isEmpty() - ? 
clusterConfig.getDefaultInstanceCapacityMap() - : instanceConfig.getInstanceCapacityMap(); - // If the instance does not have the same capacity, we cannot add this instance. - if (!new EqualsBuilder().append(foundInstanceCapacityMap, instanceCapacityMap).isEquals()) { - throw new HelixException( - "Instance can only be added if the SWAP_OUT instance sharing the same logicalId has the same capacity" - + " as this instance. " + "Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has capacity: " + foundInstanceCapacityMap + " and this instance: " + nodeId - + " has capacity: " + instanceCapacityMap); - } - } else if (foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) { - // No need to check anything on the new node, the old node will be evacuated and the new node - // will be added. - } else { - // If the instanceConfig.getInstanceEnabled() is true and the existing instance with the same logicalId - // does not have InstanceOperation set to one of the above, we cannot add this instance. - throw new HelixException( - "Instance can only be added if the exising instance sharing the same logicalId" - + " has InstanceOperation set to " - + InstanceConstants.InstanceOperation.SWAP_OUT.name() - + " and this instance has InstanceOperation set to " - + InstanceConstants.InstanceOperation.SWAP_IN.name() - + " or the existing instance sharing the same logicalId has Instance Operation set to " - + InstanceConstants.InstanceOperation.EVACUATE.name() - + " and this instance has InstanceOperation unset. Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has InstanceOperation: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getInstanceOperation()); - } - } else if (!instanceConfig.getInstanceOperation().isEmpty()) { - // If there are no instances with the same logicalId, we can only add this instance if InstanceOperation - // is unset because it is a new instance. - throw new HelixException( - "There is no instance with logicalId: " + toAddInstanceLogicalId + " in cluster: " - + clusterName + "; therefore, " + nodeId - + " cannot join cluster with InstanceOperation set to " - + instanceConfig.getInstanceOperation() + "."); + InstanceConstants.InstanceOperation attemptedInstanceOperation = + instanceConfig.getInstanceOperation(); + try { + validateInstanceOperationTransition(instanceConfig, + !matchingLogicalIdInstances.isEmpty() ? matchingLogicalIdInstances.get(0) : null, + InstanceConstants.InstanceOperation.UNKNOWN, + attemptedInstanceOperation, clusterName); + } catch (HelixException e) { + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.UNKNOWN); + logger.error("Failed to add instance " + instanceConfig.getInstanceName() + " to cluster " + + clusterName + " with instance operation " + attemptedInstanceOperation + + ". 
Setting INSTANCE_OPERATION to " + instanceConfig.getInstanceOperation() + + " instead.", e); } ZKUtil.createChildren(_zkClient, instanceConfigsPath, instanceConfig.getRecord()); @@ -464,12 +379,14 @@ public boolean setInstanceConfig(String clusterName, String instanceName, return accessor.setProperty(instanceConfigPropertyKey, newInstanceConfig); } + @Deprecated @Override public void enableInstance(final String clusterName, final String instanceName, final boolean enabled) { enableInstance(clusterName, instanceName, enabled, null, null); } + @Deprecated @Override public void enableInstance(final String clusterName, final String instanceName, final boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -477,20 +394,6 @@ public void enableInstance(final String clusterName, final String instanceName, clusterName); BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); - // If enabled is set to true and InstanceOperation is SWAP_IN, we should fail if there is not a - // matching SWAP_OUT instance. - InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); - if (enabled && instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - InstanceConfig matchingSwapInstance = findMatchingSwapInstance(clusterName, instanceConfig); - if (matchingSwapInstance == null || !matchingSwapInstance.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - throw new HelixException("Instance cannot be enabled if InstanceOperation is set to " - + instanceConfig.getInstanceOperation() + " when there is no matching " - + InstanceConstants.InstanceOperation.SWAP_OUT.name() + " instance."); - } - } - // Eventually we will have all instances' enable/disable information in clusterConfig. Now we // update both instanceConfig and clusterConfig in transition period. enableSingleInstance(clusterName, instanceName, enabled, baseAccessor, disabledType, reason); @@ -499,6 +402,7 @@ public void enableInstance(final String clusterName, final String instanceName, } + @Deprecated @Override public void enableInstance(String clusterName, List instances, boolean enabled) { // TODO: batch enable/disable is breaking backward compatibility on instance enable with older library @@ -509,62 +413,88 @@ public void enableInstance(String clusterName, List instances, boolean e //enableInstance(clusterName, instances, enabled, null, null); } + private void validateInstanceOperationTransition(InstanceConfig instanceConfig, + InstanceConfig matchingLogicalIdInstance, + InstanceConstants.InstanceOperation currentOperation, + InstanceConstants.InstanceOperation targetOperation, + String clusterName) { + boolean targetStateEnableOrDisable = + targetOperation.equals(InstanceConstants.InstanceOperation.ENABLE) + || targetOperation.equals(InstanceConstants.InstanceOperation.DISABLE); + switch (currentOperation) { + case ENABLE: + case DISABLE: + // ENABLE or DISABLE can be set to ENABLE, DISABLE, or EVACUATE at any time. + if (ImmutableSet.of(InstanceConstants.InstanceOperation.ENABLE, + InstanceConstants.InstanceOperation.DISABLE, + InstanceConstants.InstanceOperation.EVACUATE).contains(targetOperation)) { + return; + } + case SWAP_IN: + // We can only ENABLE or DISABLE a SWAP_IN instance if there is an instance with matching logicalId + // with an InstanceOperation set to UNKNOWN. 
+ if ((targetStateEnableOrDisable && (matchingLogicalIdInstance == null + || matchingLogicalIdInstance.getInstanceOperation() + .equals(InstanceConstants.InstanceOperation.UNKNOWN))) || targetOperation.equals( + InstanceConstants.InstanceOperation.UNKNOWN)) { + return; + } + case EVACUATE: + // EVACUATE can only be set to ENABLE or DISABLE when there is no instance with the same + // logicalId in the cluster. + if ((targetStateEnableOrDisable && matchingLogicalIdInstance == null) + || targetOperation.equals(InstanceConstants.InstanceOperation.UNKNOWN)) { + return; + } + case UNKNOWN: + // UNKNOWN can be set to ENABLE or DISABLE when there is no instance with the same logicalId in the cluster + // or the instance with the same logicalId in the cluster has InstanceOperation set to EVACUATE. + // UNKNOWN can be set to SWAP_IN when there is an instance with the same logicalId in the cluster set to ENABLE, + // or DISABLE. + if ((targetStateEnableOrDisable && (matchingLogicalIdInstance == null + || matchingLogicalIdInstance.getInstanceOperation() + .equals(InstanceConstants.InstanceOperation.EVACUATE)))) { + return; + } else if (targetOperation.equals(InstanceConstants.InstanceOperation.SWAP_IN) + && matchingLogicalIdInstance != null && !ImmutableSet.of( + InstanceConstants.InstanceOperation.UNKNOWN, + InstanceConstants.InstanceOperation.EVACUATE) + .contains(matchingLogicalIdInstance.getInstanceOperation())) { + return; + } + default: + throw new HelixException( + "InstanceOperation cannot be set to " + targetOperation + " when the instance is in " + + currentOperation + " state"); + } + } + + /** + * Set the InstanceOperation of an instance in the cluster. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation + */ @Override - // TODO: Name may change in future public void setInstanceOperation(String clusterName, String instanceName, @Nullable InstanceConstants.InstanceOperation instanceOperation) { BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - // InstanceOperation can only be set to SWAP_IN when the instance is added to the cluster - // or if it is disabled. - if (instanceOperation != null && instanceOperation.equals( - InstanceConstants.InstanceOperation.SWAP_IN) && getInstanceConfig(clusterName, - instanceName).getInstanceEnabled()) { - throw new HelixException("InstanceOperation should only be set to " - + InstanceConstants.InstanceOperation.SWAP_IN.name() - + " when an instance joins the cluster for the first time(when " - + "creating the InstanceConfig) or is disabled."); - } - - // InstanceOperation cannot be set to null if there is an instance with the same logicalId in - // the cluster which does not have InstanceOperation set to SWAP_IN or SWAP_OUT. 
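The rules encoded by the validateInstanceOperationTransition switch above, condensed into one standalone helper. This is a sketch of the commented intent only; the real method relies on case fall-through and throws a HelixException instead of returning false, so it can accept more transitions than this restatement.

import java.util.EnumSet;

// Illustrative only: a local enum stands in for InstanceConstants.InstanceOperation. "peerOp"
// is the operation of the instance sharing the same logicalId, or null when no such instance exists.
final class OperationTransitionRules {
  enum Op { ENABLE, DISABLE, EVACUATE, SWAP_IN, UNKNOWN }

  static boolean isAllowed(Op current, Op target, Op peerOp) {
    boolean targetIsEnableOrDisable = target == Op.ENABLE || target == Op.DISABLE;
    switch (current) {
      case ENABLE:
      case DISABLE:
        // ENABLE/DISABLE may move to ENABLE, DISABLE, or EVACUATE at any time.
        return EnumSet.of(Op.ENABLE, Op.DISABLE, Op.EVACUATE).contains(target);
      case SWAP_IN:
        // SWAP_IN may become ENABLE/DISABLE once its peer is gone or UNKNOWN, or fall back to UNKNOWN.
        return (targetIsEnableOrDisable && (peerOp == null || peerOp == Op.UNKNOWN))
            || target == Op.UNKNOWN;
      case EVACUATE:
        // EVACUATE may become ENABLE/DISABLE only when no peer shares the logicalId, or become UNKNOWN.
        return (targetIsEnableOrDisable && peerOp == null) || target == Op.UNKNOWN;
      case UNKNOWN:
        // UNKNOWN may become ENABLE/DISABLE when there is no peer or the peer is EVACUATE,
        // and may become SWAP_IN when a peer exists that is neither UNKNOWN nor EVACUATE.
        if (targetIsEnableOrDisable) {
          return peerOp == null || peerOp == Op.EVACUATE;
        }
        return target == Op.SWAP_IN && peerOp != null && peerOp != Op.UNKNOWN
            && peerOp != Op.EVACUATE;
      default:
        return false;
    }
  }
}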
- if (instanceOperation == null) { - InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); - String logicalIdKey = ClusterTopologyConfig.createFromClusterConfig( - _configAccessor.getClusterConfig(clusterName)).getEndNodeType(); - - HelixConfigScope instanceConfigScope = - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build(); - List existingInstanceIds = getConfigKeys(instanceConfigScope); - List matchingInstancesWithNonSwappingInstanceOperation = - existingInstanceIds.parallelStream() - .map(existingInstanceId -> getInstanceConfig(clusterName, existingInstanceId)).filter( - existingInstanceConfig -> - !existingInstanceConfig.getInstanceName().equals(instanceName) - && existingInstanceConfig.getLogicalId(logicalIdKey) - .equals(instanceConfig.getLogicalId(logicalIdKey)) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) - .collect(Collectors.toList()); - - if (!matchingInstancesWithNonSwappingInstanceOperation.isEmpty()) { - throw new HelixException("InstanceOperation cannot be set to null for " + instanceName - + " if there are other instances with the same logicalId in the cluster that do not have" - + " InstanceOperation set to SWAP_IN, SWAP_OUT, or EVACUATE."); - } - } - - if (!baseAccessor.exists(path, 0)) { - throw new HelixException( - "Cluster " + clusterName + ", instance: " + instanceName + ", instance config does not exist"); + InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); + if (instanceConfig == null) { + throw new HelixException("Cluster " + clusterName + ", instance: " + instanceName + + ", instance config does not exist"); } + List matchingLogicalIdInstances = + findInstancesMatchingLogicalId(clusterName, instanceConfig); + validateInstanceOperationTransition(instanceConfig, + !matchingLogicalIdInstances.isEmpty() ? matchingLogicalIdInstances.get(0) : null, + instanceConfig.getInstanceOperation(), + instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE : instanceOperation, + clusterName); boolean succeeded = baseAccessor.update(path, new DataUpdater() { @Override @@ -589,50 +519,33 @@ public ZNRecord update(ZNRecord currentData) { public boolean isEvacuateFinished(String clusterName, String instanceName) { if (!instanceHasFullAutoCurrentStateOrMessage(clusterName, instanceName)) { InstanceConfig config = getInstanceConfig(clusterName, instanceName); - return config != null && config.getInstanceOperation().equals(InstanceConstants.InstanceOperation.EVACUATE.name()); + return config != null && config.getInstanceOperation() + .equals(InstanceConstants.InstanceOperation.EVACUATE); } return false; } /** - * Find the instance that the passed instance is swapping with. If the passed instance has - * SWAP_OUT instanceOperation, then find the corresponding instance that has SWAP_IN - * instanceOperation. If the passed instance has SWAP_IN instanceOperation, then find the - * corresponding instance that has SWAP_OUT instanceOperation. + * Find the instance that the passed instance has a matching logicalId with. * * @param clusterName The cluster name - * @param instanceConfig The instance to find the swap instance for - * @return The swap instance if found, null otherwise. 
+ * @param instanceConfig The instance to find the matching instance for + * @return The matching instance if found, null otherwise. */ - @Nullable - private InstanceConfig findMatchingSwapInstance(String clusterName, + private List findInstancesMatchingLogicalId(String clusterName, InstanceConfig instanceConfig) { String logicalIdKey = ClusterTopologyConfig.createFromClusterConfig(_configAccessor.getClusterConfig(clusterName)) .getEndNodeType(); - - for (String potentialSwappingInstance : getConfigKeys( + return getConfigKeys( new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build())) { - InstanceConfig potentialSwappingInstanceConfig = - getInstanceConfig(clusterName, potentialSwappingInstance); - - // Return if there is a matching Instance with the same logicalId and opposite InstanceOperation swap operation. - if (potentialSwappingInstanceConfig.getLogicalId(logicalIdKey) - .equals(instanceConfig.getLogicalId(logicalIdKey)) && ( - instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && potentialSwappingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) || ( - instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) - && potentialSwappingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()))) { - return potentialSwappingInstanceConfig; - } - } - - return null; + clusterName).build()).stream() + .map(instanceName -> getInstanceConfig(clusterName, instanceName)).filter( + potentialInstanceConfig -> + !potentialInstanceConfig.getInstanceName().equals(instanceConfig.getInstanceName()) + && potentialInstanceConfig.getLogicalId(logicalIdKey) + .equals(instanceConfig.getLogicalId(logicalIdKey))) + .collect(Collectors.toList()); } /** @@ -661,14 +574,13 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, accessor.getProperty(keyBuilder.liveInstance(swapInInstanceName)); InstanceConfig swapOutInstanceConfig = getInstanceConfig(clusterName, swapOutInstanceName); InstanceConfig swapInInstanceConfig = getInstanceConfig(clusterName, swapInInstanceName); - if (swapInLiveInstance == null || !swapInInstanceConfig.getInstanceEnabled()) { + if (swapInLiveInstance == null) { logger.warn( - "SwapOutInstance {} is {} + {} and SwapInInstance {} is {} + {} for cluster {}. Swap will" - + " not complete unless SwapInInstance instance is ENABLED and ONLINE.", + "SwapOutInstance {} is {} + {} and SwapInInstance {} is OFFLINE + {} for cluster {}. Swap will" + + " not complete unless SwapInInstance instance is ONLINE.", swapOutInstanceName, swapOutLiveInstance != null ? "ONLINE" : "OFFLINE", - swapOutInstanceConfig.getInstanceEnabled() ? "ENABLED" : "DISABLED", swapInInstanceName, - swapInLiveInstance != null ? "ONLINE" : "OFFLINE", - swapInInstanceConfig.getInstanceEnabled() ? "ENABLED" : "DISABLED", clusterName); + swapOutInstanceConfig.getInstanceOperation(), swapInInstanceName, + swapInInstanceConfig.getInstanceOperation(), clusterName); return false; } @@ -705,21 +617,15 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, return false; } - // 4. 
Collect a list of all partitions that have a current state on swapOutInstance - String swapOutLastActiveSession; - if (swapOutLiveInstance == null) { - // SwapOutInstance is down, try to find the last active session - if (swapOutSessions.size() != 1) { - logger.warn( - "SwapOutInstance {} is offline and has {} sessions for cluster {}. Swap can't be " - + "verified if last active session can't be determined. There should only be one session.", - swapOutInstanceName, swapOutSessions.size(), clusterName); - return false; - } - swapOutLastActiveSession = swapOutSessions.get(0); - } else { - swapOutLastActiveSession = swapOutLiveInstance.getEphemeralOwner(); + // 4. If the swap-out instance is not alive or is disabled, we return true without checking + // the current states on the swap-in instance. + if (swapOutLiveInstance == null || swapOutInstanceConfig.getInstanceOperation() + .equals(InstanceConstants.InstanceOperation.DISABLE)) { + return true; } + + // 5. Collect a list of all partitions that have a current state on swapOutInstance + String swapOutLastActiveSession = swapOutLiveInstance.getEphemeralOwner(); String swapInActiveSession = swapInLiveInstance.getEphemeralOwner(); // Iterate over all resources with current states on the swapOutInstance @@ -754,24 +660,22 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, String swapOutPartitionState = swapOutResourceCurrentState.getState(partitionName); String swapInPartitionState = swapInResourceCurrentState.getState(partitionName); - // SwapInInstance should not have any partitions in ERROR state. - if (swapInPartitionState.equals(HelixDefinedState.ERROR.name())) { - logger.warn( - "SwapOutInstance {} has partition {} in state {} and SwapInInstance {} has partition {} in state {} for cluster {}." - + " Swap will not complete unless both instances have no partitions in ERROR state.", - swapOutInstanceName, partitionName, swapOutPartitionState, swapInInstanceName, - partitionName, swapInPartitionState, clusterName); - return false; - } - - // The state of the partition on the swapInInstance be in the topState or a secondTopState. - // It should be in a topState only if the state model allows multiple replicas in the topState. - // In all other cases it should be a secondTopState. - if (!(swapInPartitionState.equals(topState) || secondTopStates.contains( + // SwapInInstance should have the correct state for the partition. + // All states should match except for the case where the topState is not ALL_REPLICAS or ALL_CANDIDATE_NODES + // or the swap-out partition is ERROR state. + // When the topState is not ALL_REPLICAS or ALL_CANDIDATE_NODES, the swap-in partition should be in a secondTopStates. + if (!(swapOutPartitionState.equals(HelixDefinedState.ERROR.name()) || ( + topState.equals(swapOutPartitionState) && ( + swapOutPartitionState.equals(swapInPartitionState) || + !ImmutableSet.of(StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS, + StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES).contains( + stateModelDefinition.getNumInstancesPerState( + stateModelDefinition.getTopState())) && secondTopStates.contains( + swapInPartitionState))) || swapOutPartitionState.equals( swapInPartitionState))) { logger.warn( "SwapOutInstance {} has partition {} in {} but SwapInInstance {} has partition {} in state {} for cluster {}." 
- + " Swap will not complete unless SwapInInstance has partition in topState or secondState.", + + " Swap will not complete unless SwapInInstance has partition in correct states.", swapOutInstanceName, partitionName, swapOutPartitionState, swapInInstanceName, partitionName, swapInPartitionState, clusterName); return false; @@ -792,12 +696,21 @@ public boolean canCompleteSwap(String clusterName, String instanceName) { return false; } - InstanceConfig swapOutInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + List swappingInstances = + findInstancesMatchingLogicalId(clusterName, instanceConfig); + if (swappingInstances.size() != 1) { + logger.warn( + "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", + instanceName, clusterName); + return false; + } + + InstanceConfig swapOutInstanceConfig = + !instanceConfig.getInstanceOperation().equals(InstanceConstants.InstanceOperation.SWAP_IN) + ? instanceConfig : swappingInstances.get(0); InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig + : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -821,12 +734,21 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, return false; } - InstanceConfig swapOutInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + List swappingInstances = + findInstancesMatchingLogicalId(clusterName, instanceConfig); + if (swappingInstances.size() != 1) { + logger.warn( + "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", + instanceName, clusterName); + return false; + } + + InstanceConfig swapOutInstanceConfig = + !instanceConfig.getInstanceOperation().equals(InstanceConstants.InstanceOperation.SWAP_IN) + ? instanceConfig : swappingInstances.get(0); InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig + : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -840,11 +762,39 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, return false; } - // Complete the swap by removing the InstanceOperation for the SWAP_IN node and disabling the SWAP_OUT node. 
- setInstanceOperation(clusterName, swapInInstanceConfig.getInstanceName(), null); - enableInstance(clusterName, swapOutInstanceConfig.getInstanceName(), false); + BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); + String swapInInstanceConfigPath = + PropertyPathBuilder.instanceConfig(clusterName, swapInInstanceConfig.getInstanceName()); + String swapOutInstanceConfigPath = + PropertyPathBuilder.instanceConfig(clusterName, swapOutInstanceConfig.getInstanceName()); + + Map> updaterMap = new HashMap<>(); + updaterMap.put(swapInInstanceConfigPath, currentData -> { + if (currentData == null) { + throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName + + ", SWAP_IN instance config is null"); + } - return true; + InstanceConfig currentSwapOutInstanceConfig = + getInstanceConfig(clusterName, swapOutInstanceConfig.getInstanceName()); + InstanceConfig config = new InstanceConfig(currentData); + config.overwriteInstanceConfig(currentSwapOutInstanceConfig); + // Special handling in case the swap-out instance does not have HELIX_ENABLED or InstanceOperation set. + return config.getRecord(); + }); + + updaterMap.put(swapOutInstanceConfigPath, currentData -> { + if (currentData == null) { + throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName + + ", swap out instance config is null"); + } + + InstanceConfig config = new InstanceConfig(currentData); + config.setInstanceOperation(InstanceConstants.InstanceOperation.UNKNOWN); + return config.getRecord(); + }); + + return baseAccessor.multiSet(updaterMap); } @Override @@ -2427,6 +2377,7 @@ public void enableBatchMessageMode(String clusterName, String resourceName, bool setResourceIdealState(clusterName, resourceName, idealState); } + @Deprecated private void enableSingleInstance(final String clusterName, final String instanceName, final boolean enabled, BaseDataAccessor baseAccessor, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -2448,7 +2399,7 @@ public ZNRecord update(ZNRecord currentData) { InstanceConfig config = new InstanceConfig(currentData); config.setInstanceEnabled(enabled); if (!enabled) { - // new disabled type and reason will over write existing ones. + // new disabled type and reason will overwrite existing ones. 
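completeSwapIfPossible above hands the swap-out node's configuration to the swap-in node via InstanceConfig#overwriteInstanceConfig (added later in this patch), while protecting identity fields such as host and port. The merge idea, sketched over plain maps rather than ZNRecord fields:

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Illustrative only: merge "overwritable" keys from a source map into a target map while
// keeping the target's protected identity keys untouched.
final class ConfigOverwriteSketch {
  static Map<String, String> overwriteExceptProtected(Map<String, String> target,
      Map<String, String> source, Set<String> protectedKeys) {
    Map<String, String> merged = new HashMap<>(target);
    // Drop the target's overwritable keys so stale values do not survive the handover.
    merged.keySet().removeIf(key -> !protectedKeys.contains(key));
    // Copy the source's overwritable keys; protected keys are never taken from the source.
    source.forEach((key, value) -> {
      if (!protectedKeys.contains(key)) {
        merged.put(key, value);
      }
    });
    return merged;
  }
}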
config.resetInstanceDisabledTypeAndReason(); if (reason != null) { config.setInstanceDisabledReason(reason); diff --git a/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java b/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java index fdc05ac9bb..edb7a76c6d 100644 --- a/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java @@ -75,6 +75,8 @@ public enum ClusterConfigProperty { // TODO: if we want to support this for other rebalancers, we need to implement that logic GLOBAL_MAX_PARTITIONS_ALLOWED_PER_INSTANCE, // The following two include offline AND disabled instances + // TODO: At some point we should rename this to something like MAX_INSTANCES_UNABLE_TO_TAKE_ACCEPT_REPLICAS + // to make it clear that it includes both offline and non-assignable instances MAX_OFFLINE_INSTANCES_ALLOWED, NUM_OFFLINE_INSTANCES_FOR_AUTO_EXIT, // For auto-exiting maintenance mode @@ -88,7 +90,9 @@ public enum ClusterConfigProperty { // state transition if the number of // partitons that need recovery or in // error exceeds this limitation + @Deprecated // TODO: Remove in Helix 2.0 DISABLED_INSTANCES, + @Deprecated // TODO: Remove in Helix 2.0 DISABLED_INSTANCES_WITH_INFO, // disabled instances and disabled instances with info are for storing batch disabled instances. // disabled instances will write into both 2 fields for backward compatibility. @@ -816,8 +820,11 @@ public void setDisabledInstancesWithInfo(Map disabledInstancesWi /** * Get current disabled instance map of + * @deprecated We will no longer be using the clusterConfig to disable instances + * please use the InstanceConfig to disable instances * @return a non-null map of disabled instances in cluster config */ + @Deprecated public Map getDisabledInstances() { Map disabledInstances = _record.getMapField(ClusterConfigProperty.DISABLED_INSTANCES.name()); @@ -827,8 +834,10 @@ public Map getDisabledInstances() { /** * Get current disabled instance map of * + * @deprecated Please use InstanceConfig for enabling and disabling instances * @return a non-null map of disabled instances in cluster config */ + @Deprecated public Map getDisabledInstancesWithInfo() { Map disabledInstances = _record.getMapField(ClusterConfigProperty.DISABLED_INSTANCES_WITH_INFO.name()); diff --git a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java index 2f3da14569..de41646c39 100644 --- a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java @@ -29,6 +29,7 @@ import java.util.Set; import java.util.stream.Collectors; +import com.google.common.collect.ImmutableSet; import org.apache.helix.HelixException; import org.apache.helix.HelixProperty; import org.apache.helix.constants.InstanceConstants; @@ -47,9 +48,7 @@ public class InstanceConfig extends HelixProperty { * Configurable characteristics of an instance */ public enum InstanceConfigProperty { - HELIX_HOST, - HELIX_PORT, - HELIX_ZONE_ID, + HELIX_HOST, HELIX_PORT, HELIX_ZONE_ID, @Deprecated HELIX_ENABLED, HELIX_ENABLED_TIMESTAMP, HELIX_DISABLED_REASON, @@ -71,6 +70,13 @@ public enum InstanceConfigProperty { private static final int TARGET_TASK_THREAD_POOL_SIZE_NOT_SET = -1; private static final boolean HELIX_ENABLED_DEFAULT_VALUE = true; + // These fields are not allowed to be overwritten by the merge method because + // they are unique 
properties of an instance. + private static final ImmutableSet NON_OVERWRITABLE_PROPERTIES = + ImmutableSet.of(InstanceConfigProperty.HELIX_HOST, InstanceConfigProperty.HELIX_PORT, + InstanceConfigProperty.HELIX_ZONE_ID, InstanceConfigProperty.DOMAIN, + InstanceConfigProperty.INSTANCE_INFO_MAP); + private static final Logger _logger = LoggerFactory.getLogger(InstanceConfig.class.getName()); /** @@ -252,20 +258,22 @@ public boolean containsTag(String tag) { } /** - * Check if this instance is enabled and able to serve replicas - * @return true if enabled, false if disabled + * Get the timestamp (milliseconds from epoch) when this instance was enabled/disabled last time. + * + * @return the timestamp when the instance was enabled/disabled last time. If the instance is never + * enabled/disabled, return -1. */ - public boolean getInstanceEnabled() { - return _record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), - HELIX_ENABLED_DEFAULT_VALUE); + public long getInstanceEnabledTime() { + return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), -1); } /** - * Set the enabled state of the instance - * If user enables the instance, HELIX_DISABLED_REASON filed will be removed. - * + * Set the enabled state of the instance If user enables the instance, HELIX_DISABLED_REASON filed + * will be removed. + * @deprecated This method is deprecated. Please use setInstanceOperation instead. * @param enabled true to enable, false to disable */ + @Deprecated public void setInstanceEnabled(boolean enabled) { // set instance operation only when we need to change InstanceEnabled value. setInstanceEnabledHelper(enabled); @@ -292,7 +300,7 @@ public void resetInstanceDisabledTypeAndReason() { * It will be a no-op when instance is enabled. */ public void setInstanceDisabledReason(String disabledReason) { - if (!getInstanceEnabled()) { + if (getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), disabledReason); } } @@ -302,13 +310,14 @@ public void setInstanceDisabledReason(String disabledReason) { * It will be a no-op when instance is enabled. */ public void setInstanceDisabledType(InstanceConstants.InstanceDisabledType disabledType) { - if (!getInstanceEnabled()) { + if (getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), disabledType.name()); } } /** + * Get the instance disabled reason when instance is disabled. * @return Return instance disabled reason. Default is am empty string. */ public String getInstanceDisabledReason() { @@ -321,7 +330,7 @@ public String getInstanceDisabledReason() { * Default is am empty string. */ public String getInstanceDisabledType() { - if (getInstanceEnabled()) { + if (!getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { return InstanceConstants.INSTANCE_NOT_DISABLED; } return _record.getStringField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), @@ -329,21 +338,85 @@ public String getInstanceDisabledType() { } /** - * Get the timestamp (milliseconds from epoch) when this instance was enabled/disabled last time. + * Set the instance operation for this instance. 
* - * @return + * @param operation the instance operation */ - public long getInstanceEnabledTime() { - return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), -1); - } - public void setInstanceOperation(InstanceConstants.InstanceOperation operation) { _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), operation == null ? "" : operation.name()); + if (operation == null || operation == InstanceConstants.InstanceOperation.ENABLE + || operation == InstanceConstants.InstanceOperation.DISABLE) { + // We are still setting the HELIX_ENABLED field for backwards compatibility. + // It is possible that users will be using earlier version of HelixAdmin or helix-rest + // is on older version. + // TODO: Remove this when we are sure that all users are using the new field INSTANCE_OPERATION. + setInstanceEnabledHelper(!(operation == InstanceConstants.InstanceOperation.DISABLE)); + } + } + + private void setInstanceOperationInit(InstanceConstants.InstanceOperation operation) { + if (operation == null) { + return; + } + _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), operation.name()); + } + + /** + * Get the InstanceOperation of this instance, default is ENABLE if nothing is set. If + * HELIX_ENABLED is set to false, then the instance operation is DISABLE for backwards + * compatibility. + * + * @return the instance operation + */ + public InstanceConstants.InstanceOperation getInstanceOperation() { + String instanceOperationString = + _record.getSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name()); + + InstanceConstants.InstanceOperation instanceOperation; + try { + // If INSTANCE_OPERATION is not set, then the instance is enabled. + instanceOperation = (instanceOperationString == null || instanceOperationString.isEmpty()) + ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.valueOf(instanceOperationString); + } catch (IllegalArgumentException e) { + _logger.error("Invalid instance operation: " + instanceOperationString + " for instance: " + + _record.getId() + + ". You may need to update your version of Helix to get support for this " + + "type of InstanceOperation. Defaulting to UNKNOWN."); + return InstanceConstants.InstanceOperation.UNKNOWN; + } + + // Always respect the HELIX_ENABLED being set to false when instance operation is unset + // for backwards compatibility. + if (!_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), + HELIX_ENABLED_DEFAULT_VALUE) + && (InstanceConstants.INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS.contains( + instanceOperation))) { + return InstanceConstants.InstanceOperation.DISABLE; + } + + return instanceOperation; } - public String getInstanceOperation() { - return _record.getStringField(InstanceConfigProperty.INSTANCE_OPERATION.name(), ""); + /** + * Check if this instance is enabled. This is used to determine if the instance can host online + * replicas and take new assignment. + * + * @return true if enabled, false otherwise + */ + public boolean getInstanceEnabled() { + return getInstanceOperation().equals(InstanceConstants.InstanceOperation.ENABLE); + } + + /** + * Check to see if the instance is assignable. This is used to determine if the instance can be + * selected by the rebalancer to take assignment of replicas. 
+ * + * @return true if the instance is assignable, false otherwise + */ + public boolean isAssignable() { + return InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains(getInstanceOperation()); } /** @@ -777,6 +850,34 @@ public boolean validateTopologySettingInInstanceConfig(ClusterConfig clusterConf return true; } + /** + * Overwrite the InstanceConfigProperties from the given InstanceConfig to this InstanceConfig. + * The merge is done by overwriting the properties in this InstanceConfig with the properties + * from the given InstanceConfig. {@link #NON_OVERWRITABLE_PROPERTIES} will not be overridden. + * + * @param overwritingInstanceConfig the InstanceConfig to override into this InstanceConfig + */ + public void overwriteInstanceConfig(InstanceConfig overwritingInstanceConfig) { + // Remove all overwritable fields from the record + Set overwritableProperties = Arrays.stream(InstanceConfigProperty.values()) + .filter(property -> !NON_OVERWRITABLE_PROPERTIES.contains(property)).map(Enum::name) + .collect(Collectors.toSet()); + _record.getSimpleFields().keySet().removeAll(overwritableProperties); + _record.getListFields().keySet().removeAll(overwritableProperties); + _record.getMapFields().keySet().removeAll(overwritableProperties); + + // Get all overwritable fields from the overwritingInstanceConfig and set them in this record + overwritingInstanceConfig.getRecord().getSimpleFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setSimpleField(entry.getKey(), entry.getValue())); + overwritingInstanceConfig.getRecord().getListFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setListField(entry.getKey(), entry.getValue())); + overwritingInstanceConfig.getRecord().getMapFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setMapField(entry.getKey(), entry.getValue())); + } + public static class Builder { private String _hostName; private String _port; @@ -828,12 +929,15 @@ public InstanceConfig build(String instanceId) { instanceConfig.addTag(tag); } - if (_instanceEnabled != HELIX_ENABLED_DEFAULT_VALUE) { - instanceConfig.setInstanceEnabled(_instanceEnabled); + if (_instanceOperation == null && _instanceEnabled != HELIX_ENABLED_DEFAULT_VALUE) { + instanceConfig.setInstanceOperationInit( + _instanceEnabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); } - if (_instanceOperation != null) { - instanceConfig.setInstanceOperation(_instanceOperation); + if (_instanceOperation != null && !_instanceOperation.equals( + InstanceConstants.InstanceOperation.ENABLE)) { + instanceConfig.setInstanceOperationInit(_instanceOperation); } if (_instanceInfoMap != null) { @@ -899,9 +1003,11 @@ public Builder addTag(String tag) { /** * Set the enabled status for this instance + * @deprecated HELIX_ENABLED is no longer in use. Use setInstanceOperation instead. 
* @param instanceEnabled true if enabled, false otherwise * @return InstanceConfig.Builder */ + @Deprecated public Builder setInstanceEnabled(boolean instanceEnabled) { _instanceEnabled = instanceEnabled; return this; diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index f978130b7b..83e0e1c604 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -50,7 +50,9 @@ public enum TriggeringEntity { * maintenance mode. This field does not apply when triggered manually. */ public enum AutoTriggerReason { + @Deprecated // Replaced with MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS MAX_OFFLINE_INSTANCES_EXCEEDED, + MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS, MAX_PARTITION_PER_INSTANCE_EXCEEDED, NOT_APPLICABLE // Not triggered automatically or automatically exiting maintenance mode } diff --git a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java index 8872e9edac..db9ada93c4 100644 --- a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java +++ b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java @@ -26,7 +26,6 @@ import java.util.stream.Collectors; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; import org.apache.helix.HelixConstants; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; @@ -45,16 +44,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; + /** * Cache the cluster data that are needed by RoutingTableProvider. */ class RoutingDataCache extends BasicClusterDataCache { private static Logger LOG = LoggerFactory.getLogger(RoutingDataCache.class.getName()); - // When an instance has any of these instance operations, it should not be routable. 
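Summarizing how the reworked InstanceConfig#getInstanceOperation (earlier in this patch) resolves the legacy HELIX_ENABLED flag, as a standalone sketch; the override set is passed in rather than hard-coded because its membership is defined by InstanceConstants.INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS.

import java.util.Set;

// Illustrative only: how INSTANCE_OPERATION and the legacy HELIX_ENABLED flag combine into one
// effective operation, restated over plain values.
final class EffectiveOperation {
  enum Op { ENABLE, DISABLE, EVACUATE, SWAP_IN, UNKNOWN }

  static Op resolve(String instanceOperationField, Boolean helixEnabledField,
      Set<Op> disabledOverridableOps) {
    Op op;
    if (instanceOperationField == null || instanceOperationField.isEmpty()) {
      op = Op.ENABLE; // an unset operation means the instance is enabled
    } else {
      try {
        op = Op.valueOf(instanceOperationField);
      } catch (IllegalArgumentException e) {
        return Op.UNKNOWN; // a value written by a newer client that this library does not understand
      }
    }
    // Backwards compatibility: HELIX_ENABLED=false still forces DISABLE for the overridable operations.
    boolean helixEnabled = helixEnabledField == null || helixEnabledField;
    if (!helixEnabled && disabledOverridableOps.contains(op)) {
      return Op.DISABLE;
    }
    return op;
  }
}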
- private static final ImmutableSet NON_ROUTABLE_INSTANCE_OPERATIONS = - ImmutableSet.of(InstanceConstants.InstanceOperation.SWAP_IN.name()); - private final Map> _sourceDataTypeMap; private CurrentStateCache _currentStateCache; @@ -185,7 +181,7 @@ public synchronized void refresh(HelixDataAccessor accessor) { private void updateRoutableInstanceConfigMap(Map instanceConfigMap) { _routableInstanceConfigMap = instanceConfigMap.entrySet().stream().filter( - (instanceConfigEntry) -> !NON_ROUTABLE_INSTANCE_OPERATIONS.contains( + (instanceConfigEntry) -> !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( instanceConfigEntry.getValue().getInstanceOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } @@ -194,7 +190,7 @@ private void updateRoutableLiveInstanceMap(Map instanceC Map liveInstanceMap) { _routableLiveInstanceMap = liveInstanceMap.entrySet().stream().filter( (liveInstanceEntry) -> instanceConfigMap.containsKey(liveInstanceEntry.getKey()) - && !NON_ROUTABLE_INSTANCE_OPERATIONS.contains( + && !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( instanceConfigMap.get(liveInstanceEntry.getKey()).getInstanceOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } diff --git a/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java b/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java index fd22a8e1fd..6d4c687fcc 100644 --- a/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java +++ b/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java @@ -141,8 +141,8 @@ public ResourceAssignment processJobStatusUpdateAndAssignment(String jobName, // Will contain the list of partitions that must be explicitly dropped from the ideal state that // is stored in zk. Set liveInstances = - jobCfg.getInstanceGroupTag() == null ? _dataProvider.getAssignableEnabledLiveInstances() - : _dataProvider.getAssignableEnabledLiveInstancesWithTag(jobCfg.getInstanceGroupTag()); + jobCfg.getInstanceGroupTag() == null ? 
_dataProvider.getEnabledLiveInstances() + : _dataProvider.getEnabledLiveInstancesWithTag(jobCfg.getInstanceGroupTag()); if (liveInstances.isEmpty()) { LOG.error("No available instance found for job: {}", jobName); diff --git a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java index 0d9a4f1ac2..633ce0341f 100644 --- a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java +++ b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java @@ -313,7 +313,7 @@ public void dropInstanceFromCluster(String clusterName, String instanceId) { ClusterConfig clusterConfig = accessor.getProperty(keyBuilder.clusterConfig()); // ensure node is disabled, otherwise fail - if (InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (config.getInstanceEnabled()) { String error = "Node " + instanceId + " is enabled, cannot drop"; _logger.warn(error); throw new HelixException(error); diff --git a/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java b/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java index d0da9ba8eb..7ed44f825c 100644 --- a/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java +++ b/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java @@ -331,7 +331,7 @@ private Map> computeIdealPartitionState( for (String partition : idealState.getPartitionSet()) { List preferenceList = AbstractRebalancer.getPreferenceList(new Partition(partition), - idealState, cache.getAssignableEnabledLiveInstances()); + idealState, cache.getEnabledLiveInstances()); Map idealMapping; if (_isDeactivatedNodeAware) { idealMapping = HelixUtil.computeIdealMapping(preferenceList, stateModelDef, @@ -339,7 +339,7 @@ private Map> computeIdealPartitionState( cache.getDisabledInstancesForPartition(idealState.getResourceName(), partition)); } else { idealMapping = HelixUtil.computeIdealMapping(preferenceList, stateModelDef, - cache.getAssignableEnabledLiveInstances(), + cache.getEnabledLiveInstances(), Collections.emptySet()); } idealPartitionState.put(partition, idealMapping); diff --git a/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java b/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java index 4a3d49b73a..834b846783 100644 --- a/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java @@ -399,7 +399,7 @@ public static Map> getIdealAssignmentForFullAuto( // Remove all disabled instances so that Helix will not consider them live. 
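Since ClusterSetup#dropInstanceFromCluster now reads the operation-derived enabled state, a typical decommission flow disables through the operation API before dropping. A hedged usage sketch; the ZooKeeper address, cluster, and instance names are placeholders.

import org.apache.helix.HelixAdmin;
import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.manager.zk.ZKHelixAdmin;

// Illustrative usage only.
public class DisableThenDropExample {
  public static void main(String[] args) {
    HelixAdmin admin = new ZKHelixAdmin("localhost:2181");

    // Preferred over the deprecated enableInstance(clusterName, instanceName, false).
    admin.setInstanceOperation("MyCluster", "localhost_12000",
        InstanceConstants.InstanceOperation.DISABLE);

    // Once the participant process has shut down, the disabled node can be dropped; the drop
    // path now derives "enabled" from the instance operation rather than the legacy flag.
    admin.dropInstance("MyCluster", admin.getInstanceConfig("MyCluster", "localhost_12000"));

    admin.close();
  }
}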
List disabledInstance = instanceConfigs.stream() - .filter(instanceConfig -> !InstanceValidationUtil.isInstanceEnabled(instanceConfig, clusterConfig)) + .filter(instanceConfig -> !instanceConfig.getInstanceEnabled()) .map(InstanceConfig::getInstanceName) .collect(Collectors.toList()); liveInstances.removeAll(disabledInstance); diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java index 2542ecf7fb..5dea683346 100644 --- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java @@ -73,22 +73,22 @@ private InstanceValidationUtil() { public static boolean isEnabled(HelixDataAccessor dataAccessor, String instanceName) { PropertyKey.Builder propertyKeyBuilder = dataAccessor.keyBuilder(); InstanceConfig instanceConfig = dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instanceName)); - ClusterConfig clusterConfig = dataAccessor.getProperty(propertyKeyBuilder.clusterConfig()); - // TODO deprecate instance level config checks once migrated the enable status to cluster config only - if (instanceConfig == null || clusterConfig == null) { - throw new HelixException("InstanceConfig or ClusterConfig is NULL"); + if (instanceConfig == null) { + throw new HelixException("InstanceConfig is NULL"); } - return isInstanceEnabled(instanceConfig, clusterConfig); - + return instanceConfig.getInstanceEnabled(); } /** * Check if the instance is enabled by configuration + * @deprecated Use {@link InstanceConfig#getInstanceEnabled()} instead. We will no longer + * be using cluster config to enable/disable instances. * @param instanceConfig * @param clusterConfig * @return */ + @Deprecated public static boolean isInstanceEnabled(InstanceConfig instanceConfig, ClusterConfig clusterConfig) { if (instanceConfig == null) { throw new HelixException("InstanceConfig is NULL"); diff --git a/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java b/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java index bc439549fe..18ddc0283d 100644 --- a/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java @@ -29,6 +29,7 @@ import org.apache.helix.api.config.RebalanceConfig; import org.apache.helix.api.rebalancer.constraint.AbstractRebalanceHardConstraint; import org.apache.helix.api.rebalancer.constraint.AbstractRebalanceSoftConstraint; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.common.PartitionStateMap; import org.apache.helix.controller.common.ResourcesStateMap; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; @@ -77,7 +78,7 @@ public WeightAwareRebalanceUtil(ClusterConfig clusterConfig, List instanceConfigs) { for (InstanceConfig instanceConfig : instanceConfigs) { // ensure the instance is enabled - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _instanceConfigMap.put(instanceConfig.getInstanceName(), instanceConfig); } // ensure no instance is disabled diff --git a/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java b/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java index 0218c3ffcb..a265605185 100644 --- a/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java +++ 
b/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java @@ -45,6 +45,7 @@ import org.apache.helix.SystemPropertyKeys; import org.apache.helix.TestHelper; import org.apache.helix.api.config.HelixConfigProperty; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.pipeline.AbstractAsyncBaseStage; import org.apache.helix.controller.pipeline.Pipeline; import org.apache.helix.controller.pipeline.Stage; @@ -633,7 +634,6 @@ protected List setupLiveInstances(String clusterName, int[] liveIn for (int i = 0; i < liveInstances.length; i++) { String instance = "localhost_" + liveInstances[i]; - _liveInstanceOwners.putIfAbsent(clusterName, new HashMap<>()); Map clientMap = _liveInstanceOwners.get(clusterName); clientMap.putIfAbsent(instance, DedicatedZkClientFactory.getInstance() @@ -687,7 +687,7 @@ protected void setupInstances(String clusterName, int[] instances) { InstanceConfig instanceConfig = new InstanceConfig(instance); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + instances[i]); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, instanceConfig); } } diff --git a/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java b/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java index b4f405c745..bdfa2784ba 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java +++ b/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java @@ -31,6 +31,7 @@ import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; @@ -399,7 +400,7 @@ public void testIgnoreNonTopologyChanges() { _dataAccessor.getProperty(_keyBuilder.instanceConfig(instanceName)); Assert.assertTrue(instanceConfig.getInstanceEnabled()); try { - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _dataAccessor.updateProperty(_keyBuilder.instanceConfig(instanceName), instanceConfig); _dataProvider.notifyDataChange(ChangeType.INSTANCE_CONFIG); _dataProvider.refresh(_dataAccessor); @@ -410,7 +411,7 @@ public void testIgnoreNonTopologyChanges() { } finally { // remove newly added resource/ideastate _gSetupTool.getClusterManagementTool().dropResource(CLUSTER_NAME, resourceName); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _dataAccessor.updateProperty(_keyBuilder.instanceConfig(instanceName), instanceConfig); } } diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java index 45c35f3605..f8af3ee419 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java @@ -56,6 +56,8 @@ public void testComputeBestPossibleState(String comment, String 
stateModelName, new IdealState("test"), new ClusterConfig("TestCluster"), partition, MonitoredAbnormalResolver.DUMMY_STATE_RESOLVER); + System.out.println("Expected best possible state map: " + expectedBestPossibleMap); + System.out.println("Actual best possible state map: " + bestPossibleMap); Assert.assertTrue(bestPossibleMap.equals(expectedBestPossibleMap)); } diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java index 951e0e3c52..a554283311 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java @@ -118,9 +118,9 @@ protected ResourceControllerDataProvider setupClusterDataCache() throws IOExcept liveInstanceMap.put(instanceName, testLiveInstance); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); when(testCache.getLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getAssignableInstances()).thenReturn(_instances); when(testCache.getAllInstances()).thenReturn(_instances); @@ -375,7 +375,7 @@ public void testInvalidClusterStatus() throws IOException, HelixRebalanceExcepti Collectors.toMap(resourceName -> resourceName, Resource::new)); try { rebalancer.computeBestPossibleAssignment(clusterData, resourceMap, - clusterData.getAssignableEnabledLiveInstances(), new CurrentStateOutput(), _algorithm); + clusterData.getEnabledLiveInstances(), new CurrentStateOutput(), _algorithm); Assert.fail("Rebalance shall fail."); } catch (HelixRebalanceException ex) { Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); @@ -439,7 +439,7 @@ public void testAlgorithmException() // Calculation will fail try { rebalancer.computeBestPossibleAssignment(clusterData, resourceMap, - clusterData.getAssignableEnabledLiveInstances(), new CurrentStateOutput(), badAlgorithm); + clusterData.getEnabledLiveInstances(), new CurrentStateOutput(), badAlgorithm); Assert.fail("Rebalance shall fail."); } catch (HelixRebalanceException ex) { Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); @@ -749,8 +749,8 @@ public void testRebalanceOverwrite() throws HelixRebalanceException, IOException Set instances = new HashSet<>(_instances); instances.add(offlineInstance); when(clusterData.getAssignableInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledLiveInstances()).thenReturn( + when(clusterData.getEnabledInstances()).thenReturn(instances); + when(clusterData.getEnabledLiveInstances()).thenReturn( new HashSet<>(Arrays.asList(instance0, instance1, instance2))); Map instanceOfflineTimeMap = new HashMap<>(); instanceOfflineTimeMap.put(offlineInstance, System.currentTimeMillis() + Integer.MAX_VALUE); @@ -894,8 +894,8 @@ public void 
testInstanceCapacityProvider() throws IOException, HelixRebalanceExc // force create a fake offlineInstance that's in delay window Set instances = new HashSet<>(_instances); when(clusterData.getAssignableInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledLiveInstances()).thenReturn(instances); + when(clusterData.getEnabledInstances()).thenReturn(instances); + when(clusterData.getEnabledLiveInstances()).thenReturn(instances); Map instanceConfigMap = clusterData.getAssignableInstanceConfigMap(); when(clusterData.getAssignableInstanceConfigMap()).thenReturn(instanceConfigMap); when(clusterData.getInstanceConfigMap()).thenReturn(instanceConfigMap); diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java index c5c7b560c6..3fb05e5f8b 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java @@ -266,9 +266,9 @@ protected ResourceControllerDataProvider setupClusterDataCache() throws IOExcept liveInstanceMap.put(instanceName, testLiveInstance); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); when(testCache.getLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getAssignableInstances()).thenReturn(_instances); when(testCache.getAllInstances()).thenReturn(_instances); diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java index c9deb792de..b75a340933 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java @@ -29,6 +29,7 @@ import java.util.Set; import java.util.stream.Collectors; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.rebalancer.constraint.MonitoredAbnormalResolver; import org.apache.helix.model.BuiltInStateModelDefinitions; @@ -84,7 +85,7 @@ protected InstanceConfig createMockInstanceConfig(String instanceId) { InstanceConfig testInstanceConfig = new InstanceConfig(instanceId); testInstanceConfig.setInstanceCapacityMap(_capacityDataMap); testInstanceConfig.addTag(_testInstanceTags.get(0)); - testInstanceConfig.setInstanceEnabled(true); + testInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); testInstanceConfig.setZoneId(_testFaultZoneId); return testInstanceConfig; } diff --git 
a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java index 34582d600d..2e41b6dbea 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java @@ -106,7 +106,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException activeInstances.add(instance1); activeInstances.add(instance2); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); // test 0, empty input Assert.assertEquals( @@ -144,7 +144,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException // test 2, no additional replica to be assigned testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( @@ -169,7 +169,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException // test 3, minActiveReplica==2, two partitions falling short testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( @@ -207,7 +207,7 @@ public void testClusterModelForDelayedRebalanceOverwrite() throws IOException { activeInstances.add(instance1); activeInstances.add(instance2); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); // test 1, one partition under minActiveReplica Map>> input = ImmutableMap.of( @@ -247,7 +247,7 @@ public void testClusterModelForDelayedRebalanceOverwrite() throws IOException { // test 2, minActiveReplica==2, three partitions falling short testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java index 0027f8e4ef..2a548ce457 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java @@ -58,6 +58,7 @@ public void testSemiAutoModeCompatibility() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); 
event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -94,6 +95,7 @@ public void testCustomModeCompatibility() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java index e33dc9f5da..518c610be0 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java @@ -58,6 +58,7 @@ public void testSimple() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ReadClusterDataStage stage1 = new ReadClusterDataStage(); @@ -117,6 +118,7 @@ public void testAutoEnterMaintenanceWhenExceedingOfflineNodes() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java index 7b891522cf..7d815c170e 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java @@ -68,6 +68,7 @@ public void TestOFFLINEToDROPPED() throws Exception { when(message.getToState()).thenReturn("SLAVE"); when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)).thenReturn(message); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); @@ -157,6 +158,7 @@ private List generateMessages(String currentState, String fromState, St when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)) .thenReturn(pendingMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), 
currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java index 7da3d64c25..f15e6b87dd 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java @@ -177,6 +177,7 @@ public void testNoStateMissing() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); runStage(event, new IntermediateStateCalcStage()); @@ -261,6 +262,7 @@ public void testWithClusterConfigChange() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -379,6 +381,7 @@ public void testThrottleByErrorPartition() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -553,6 +556,7 @@ public void testPartitionMissing() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); runStage(event, new IntermediateStateCalcStage()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java index 7c20b0279f..f1a3da4415 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java @@ -84,6 +84,7 @@ private List generateMessages(String currentState, String fromState, St when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)) .thenReturn(pendingMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), 
currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java index 1c6162428b..0c144d8eb3 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java @@ -51,6 +51,7 @@ public class TestManagementModeStage extends ZkTestBase { public void beforeClass() { _clusterName = "CLUSTER_" + TestHelper.getTestClassName(); _accessor = new ZKHelixDataAccessor(_clusterName, new ZkBaseDataAccessor<>(_gZkClient)); + _gSetupTool.setupTestCluster(_clusterName); _manager = new DummyClusterManager(_clusterName, _accessor); } @@ -65,6 +66,8 @@ public void testClusterFreezeStatus() throws Exception { // ideal state: node0 is MASTER, node1 is SLAVE // replica=2 means 1 master and 1 slave setupIdealState(_clusterName, new int[]{0, 1}, new String[]{"TestDB"}, 1, 2); + _gSetupTool.addInstanceToCluster(_clusterName, "localhost_0"); + _gSetupTool.addInstanceToCluster(_clusterName, "localhost_1"); List liveInstances = setupLiveInstances(_clusterName, new int[]{0, 1}); setupStateModel(_clusterName); @@ -96,7 +99,7 @@ public void testClusterFreezeStatus() throws Exception { ControllerHistory history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory()); - Assert.assertNull(history); + Assert.assertTrue(history.getMaintenanceHistoryList().isEmpty()); // Mark both live instances to be frozen, then entering freeze mode is complete for (int i = 0; i < 2; i++) { diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java index e4aeed04f8..515340d7cd 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java @@ -71,7 +71,7 @@ private void prepareCache(Map cacheMap, Mock mock) { when(mock.cache.getClusterConfig()).thenReturn((ClusterConfig) cacheMap.get(CacheKeys.clusterConfig.name())); when(mock.cache.getStateModelDef((String) cacheMap.get(CacheKeys.stateModelName.name()))).thenReturn( (StateModelDefinition) cacheMap.get(CacheKeys.stateModelDef.name())); - when(mock.cache.getAssignableEnabledLiveInstances()).thenReturn(new HashSet<>( + when(mock.cache.getEnabledLiveInstances()).thenReturn(new HashSet<>( ((Map>) cacheMap.get(CacheKeys.preferenceList.name())).values().iterator().next())); when(mock.cache.getLiveInstances()).thenReturn(new HashSet<>( ((Map>) cacheMap.get(CacheKeys.preferenceList.name())).values().iterator().next()).stream() @@ -189,6 +189,7 @@ public List loadTestInputs(String fileName) { } ClusterEvent event = new ClusterEvent(CLUSTER_NAME, ClusterEventType.Unknown); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); // add current states + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // add current states event.addAttribute(AttributeName.ControllerDataProvider.name(), buildCache(mock, numReplica, minActiveReplica, stateModelDef, stateModelName, 
preferenceLists)); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageOutput); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java index 22c20f7dd2..e457e31cab 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java @@ -85,6 +85,7 @@ public void testResourceLevelPriorityForRecoveryBalance( event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); runStage(event, new ReadClusterDataStage()); // Keep update the current state. @@ -133,6 +134,7 @@ public void testResourceLevelPriorityForLoadBalance( event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -194,6 +196,7 @@ public void testPartitionLevelPriority(String resourceName, event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), generateMessageMapForPartition(bestPossibleMap, currentStateMap, Collections.emptyList(), resourceName)); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -350,6 +353,7 @@ private void updateCurrentOutput(List resourcePriority, resourcePriority.add(resourceName); currentStateOutput.setCurrentState(resourceName, partition, instanceName, state); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); } private void updateCurrentStateForPartitionLevelPriority(List partitionPriority, diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java b/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java index 2f1dee269f..1b2e54b92c 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java @@ -33,6 +33,7 @@ import org.apache.helix.HelixDataAccessor; import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import org.apache.helix.integration.common.ZkStandAloneCMTestBase; @@ -297,7 +298,8 @@ private void setDomainId(String instanceName, ConfigAccessor configAccessor) { private void 
setInstanceEnable(String instanceName, boolean enabled, ConfigAccessor configAccessor) { InstanceConfig instanceConfig = configAccessor.getInstanceConfig(CLUSTER_NAME, instanceName); - instanceConfig.setInstanceEnabled(enabled); + instanceConfig.setInstanceOperation(enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); configAccessor.setInstanceConfig(CLUSTER_NAME, instanceName, instanceConfig); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java b/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java index a5416d4d13..ce3077dd9f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java @@ -32,6 +32,7 @@ import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; import org.apache.helix.ZkUnitTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.manager.zk.ZKHelixAdmin; @@ -170,7 +171,7 @@ public void test() throws Exception { InstanceConfig instanceConfig = new InstanceConfig(fakeInstanceName); instanceConfig.setHostName("localhost"); instanceConfig.setPort("10000"); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, instanceConfig); LiveInstance fakeInstance = new LiveInstance(fakeInstanceName); diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java b/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java index 98dd281b5e..9cb248f96f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java @@ -28,6 +28,7 @@ import org.apache.helix.TestHelper; import org.apache.helix.api.config.StateTransitionThrottleConfig; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.manager.zk.ZKHelixDataAccessor; @@ -74,7 +75,7 @@ public void testDisablingTopStateReplicaByDisablingInstance() throws Exception { // Disable instance 0 so that it will cause a partition to do a load balance PropertyKey key = _accessor.keyBuilder().instanceConfig(_participants[0].getInstanceName()); InstanceConfig instanceConfig = _accessor.getProperty(key); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _accessor.setProperty(key, instanceConfig); // Resume the controller @@ -234,7 +235,7 @@ public void testNoThrottleOnDisabledInstance() throws Exception { // Disable an instance so that it will not be subject to throttling PropertyKey key = _accessor.keyBuilder().instanceConfig(_participants[0].getInstanceName()); InstanceConfig instanceConfig = _accessor.getProperty(key); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _accessor.setProperty(key, instanceConfig); // Set the state transition delay 
so that transitions would be processed slowly diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index ebfb03e8a0..6654098f8b 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -280,7 +280,7 @@ public void testMaxPartitionLimit() throws Exception { Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(maintenanceSignal.getAutoTriggerReason(), - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); // Bring up all instances for (int i = 0; i < 3; i++) { @@ -306,7 +306,7 @@ public void testMaxPartitionLimit() throws Exception { Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(maintenanceSignal.getAutoTriggerReason(), - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); // Set the cluster config for auto-exiting maintenance mode ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); diff --git a/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java b/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java index b11e6350e5..a61ffea9ac 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java +++ b/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java @@ -41,6 +41,7 @@ import org.apache.helix.NotificationContext; import org.apache.helix.PropertyKey.Builder; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.common.ZkTestBase; import org.apache.helix.controller.HelixControllerMain; @@ -379,7 +380,7 @@ private static void addInstanceConfig(String instanceName) { if (instanceConfig == null) { InstanceConfig config = new InstanceConfig(instanceName); config.setHostName("localhost"); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); echo("Adding InstanceConfig:" + config); admin.addInstance(_clusterName, config); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java index 3b13868507..fef7ea0b96 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java @@ -30,6 +30,7 @@ import org.apache.helix.ConfigAccessor; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import 
org.apache.helix.controller.rebalancer.strategy.MultiRoundCrushRebalanceStrategy; @@ -170,7 +171,7 @@ public void testNodeSwap(String rebalanceStrategyName, String rebalanceStrategyC final InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, oldParticipantName); // disable the node first - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool().setInstanceConfig(CLUSTER_NAME, oldParticipantName, instanceConfig); Assert.assertTrue(_clusterVerifier.verify(10000)); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java index 15b66af62d..cd5338ef44 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java @@ -21,6 +21,7 @@ import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.IdealState; diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java index 1fc3a3e203..85600c01c1 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java @@ -99,7 +99,7 @@ public class TestInstanceOperation extends ZkTestBase { List _participantNames = new ArrayList<>(); private Set _allDBs = new HashSet<>(); private ZkHelixClusterVerifier _clusterVerifier; - private ZkHelixClusterVerifier _bestPossibleClusterVerifier; + private BestPossibleExternalViewVerifier _bestPossibleClusterVerifier; private ConfigAccessor _configAccessor; private long _stateModelDelay = 3L; @@ -204,7 +204,7 @@ private void disableTopologyAwareRebalance() { Assert.assertTrue(_clusterVerifier.verifyByPolling()); } - private void removeOfflineOrDisabledOrSwapInInstances() { + private void removeOfflineOrInactiveInstances() { // Remove all instances that are not live, disabled, or in SWAP_IN state. for (int i = 0; i < _participants.size(); i++) { String participantName = _participantNames.get(i); @@ -212,7 +212,7 @@ private void removeOfflineOrDisabledOrSwapInInstances() { _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, participantName); if (!_participants.get(i).isConnected() || !instanceConfig.getInstanceEnabled() || instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { if (_participants.get(i).isConnected()) { _participants.get(i).syncStop(); } @@ -268,11 +268,6 @@ public void testEvacuate() throws Exception { _gSetupTool.dropResourceFromCluster(CLUSTER_NAME, semiAutoDB); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Disable, stop, and drop the instance from the cluster. 
- _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToEvacuate, false); - _participants.get(0).syncStop(); - removeOfflineOrDisabledOrSwapInInstances(); - // Compare the current ev with the previous one, it should be exactly the same since the baseline should not change // after the instance is dropped. Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -284,9 +279,12 @@ public void testRevertEvacuation() throws Exception { System.out.println("START TestInstanceOperation.testRevertEvacuation() at " + new Date(System.currentTimeMillis())); // revert an evacuate instance String instanceToEvacuate = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertTrue( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToEvacuate) + .getInstanceEnabled()); Assert.assertTrue(_clusterVerifier.verifyByPolling()); // EV should contain all participants, check resources one by one @@ -302,10 +300,12 @@ public void testAddingNodeWithEvacuationTag() throws Exception { System.out.println("START TestInstanceOperation.testAddingNodeWithEvacuationTag() at " + new Date(System.currentTimeMillis())); // first disable and instance, and wait for all replicas to be moved out String mockNewInstance = _participants.get(0).getInstanceName(); + // This is using a deprecated method to ensure that the disabling still takes precedence over the InstanceOperation when being set + // to false. _gSetupTool.getClusterManagementTool() .enableInstance(CLUSTER_NAME, mockNewInstance, false); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - //ev should contain all instances but the disabled one + // ev should contain all instances but the disabled one Map assignment = getEVs(); List currentActiveInstances = _participantNames.stream().filter(n -> !n.equals(mockNewInstance)).collect(Collectors.toList()); @@ -317,10 +317,13 @@ public void testAddingNodeWithEvacuationTag() throws Exception { } // add evacuate tag and enable instance + // Because HELIX_ENABLED is set to false, getInstanceOperation still returns DISABLE _gSetupTool.getClusterManagementTool() .setInstanceOperation(CLUSTER_NAME, mockNewInstance, InstanceConstants.InstanceOperation.EVACUATE); - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, mockNewInstance, true); + + // enable instance so InstanceOperation is no longer overriden with DISABLE + _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, mockNewInstance, true); + //ev should be the same assignment = getEVs(); currentActiveInstances = @@ -347,84 +350,73 @@ public void testAddingNodeWithEvacuationTag() throws Exception { } } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithEvacuationTag") + @Test(dependsOnMethods = "testAddingNodeWithEvacuationTag") public void testNodeSwapNoTopologySetup() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapNoTopologySetup() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, 
- InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error that the logicalId does not have SWAP_OUT instance because, - // helix can't determine what topology key to use to get the logicalId if TOPOLOGY is not set. + // Add instance with InstanceOperation set to SWAP_IN as default + // The instance will be added with UNKNOWN because the logicalId will not match the + // swap out instance since the topology configs are not set. String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapNoTopologySetup") - public void testAddingNodeWithSwapOutInstanceOperation() throws Exception { + @Test(dependsOnMethods = "testNodeSwapNoTopologySetup") + public void testAddingNodeWithEnableInstanceOperation() throws Exception { System.out.println( - "START TestInstanceOperation.testAddingNodeWithSwapOutInstanceOperation() at " + new Date( + "START TestInstanceOperation.testAddingNodeWithEnableInstanceOperation() at " + new Date( System.currentTimeMillis())); enabledTopologyAwareRebalance(); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with InstanceOperation set to SWAP_IN + // Add instance with InstanceOperation set to ENABLE + // The instance should be added with UNKNOWN since there is already an instance with + // the same logicalId in the cluster and this instance is not being set to SWAP_IN when + // added. 
String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_OUT, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithSwapOutInstanceOperation") - public void testAddingNodeWithSwapOutNodeInstanceOperationUnset() throws Exception { - System.out.println( - "START TestInstanceOperation.testAddingNodeWithSwapOutNodeInstanceOperationUnset() at " - + new Date(System.currentTimeMillis())); - - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to null - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + InstanceConstants.InstanceOperation.ENABLE, -1); - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithSwapOutNodeInstanceOperationUnset") + @Test(dependsOnMethods = "testAddingNodeWithEnableInstanceOperation") public void testNodeSwapWithNoSwapOutNode() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapWithNoSwapOutNode() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Add new instance with InstanceOperation set to SWAP_IN + // The instance should be added with UNKNOWN since there is not an instance with a matching + // logicalId in the cluster to swap with. 
String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, "1000", "zone_1000", - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); } @Test(dependsOnMethods = "testNodeSwapWithNoSwapOutNode") @@ -433,21 +425,27 @@ public void testNodeSwapSwapInNodeNoInstanceOperationEnabled() throws Exception "START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperationEnabled() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with same logicalId with InstanceOperation unset - // This should work because adding instance with InstanceOperation unset will automatically - // set the InstanceOperation to SWAP_IN. + // Add instance with same logicalId with InstanceOperation unset, this is the same as default + // which is ENABLE. + // The instance should be set to UNKNOWN since there is already a matching logicalId in the cluster. String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); + + // Setting the InstanceOperation to SWAP_IN should work because there is a matching logicalId in + // the cluster and the InstanceCapacityWeights and FaultZone match. 
+ _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); Assert.assertTrue(_gSetupTool.getClusterManagementTool() @@ -461,20 +459,17 @@ public void testNodeSwapSwapInNodeWithAlreadySwappingPair() throws Exception { "START TestInstanceOperation.testNodeSwapSwapInNodeWithAlreadySwappingPair() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Add another instance with InstanceOperation set to SWAP_IN with same logicalId as previously // added SWAP_IN instance. @@ -482,88 +477,70 @@ public void testNodeSwapSwapInNodeWithAlreadySwappingPair() throws Exception { addParticipant(secondInstanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapSwapInNodeWithAlreadySwappingPair") - public void testNodeSwapWrongFaultZone() throws Exception { - System.out.println("START TestInstanceOperation.testNodeSwapWrongFaultZone() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to SWAP_OUT - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error because SWAP_IN instance must be in the same FAULT_ZONE as the SWAP_OUT instance. 
- String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE) + "1", - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); - } + InstanceConstants.InstanceOperation.SWAP_IN, -1); - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapWrongFaultZone") - public void testNodeSwapWrongCapacity() throws Exception { - System.out.println("START TestInstanceOperation.testNodeSwapWrongCapacity() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to SWAP_OUT - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); + // Instance should be UNKNOWN since there was already a swapping pair. + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, secondInstanceToSwapInName).getInstanceOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error because SWAP_IN instance must have same capacity as the SWAP_OUT node. - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, TEST_CAPACITY_VALUE - 10); + // Try to set the InstanceOperation to SWAP_IN, it should throw an exception since there is already + // a swapping pair. + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, secondInstanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); } - @Test(dependsOnMethods = "testNodeSwapWrongCapacity") + @Test(dependsOnMethods = "testNodeSwapSwapInNodeWithAlreadySwappingPair") public void testNodeSwap() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwap() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - // Store original EV - Map originalEVs = getEVs(); + removeOfflineOrInactiveInstances(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + String resourceToDisablePartition = _allDBs.iterator().next(); + // Disable 1 partition that is assigned to the instance that will be swapped out. 
+ getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapOutName).entrySet().stream() + .filter(entry -> entry.getKey().startsWith(resourceToDisablePartition)).findFirst() + .ifPresent(entry -> { + String partition = entry.getKey(); + instanceToSwapOutInstanceConfig.setInstanceEnabledForPartition(resourceToDisablePartition, + partition, false); + }); + _gSetupTool.getClusterManagementTool() + .setInstanceConfig(CLUSTER_NAME, instanceToSwapOutName, instanceToSwapOutInstanceConfig); + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + // Store original EV + Map originalEVs = getEVs(); + validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); + // Create a custom change listener to check if the throttles are enabled after the swap is completed. CustomIndividualInstanceConfigChangeListener instanceToSwapInInstanceConfigListener = new CustomIndividualInstanceConfigChangeListener(); + // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1, instanceToSwapInInstanceConfigListener); + InstanceConstants.InstanceOperation.SWAP_IN, -1, instanceToSwapInInstanceConfigListener); + // Validate that the throttles are off since the InstanceOperation is set to SWAP_IN Assert.assertFalse(instanceToSwapInInstanceConfigListener.isThrottlesEnabled()); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, @@ -573,7 +550,7 @@ public void testNodeSwap() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -581,20 +558,31 @@ public void testNodeSwap() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .completeSwapIfPossible(CLUSTER_NAME, instanceToSwapOutName, false)); + // Get both instanceConfigs and make sure correct fields are copied over. + InstanceConfig instanceToSwapInInstanceConfig = _gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapInName); + + Assert.assertEquals(instanceToSwapInInstanceConfig.getRecord() + .getMapField(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_PARTITION.name()), + instanceToSwapInInstanceConfig.getRecord() + .getMapField(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_PARTITION.name())); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); // Validate that the SWAP_IN instance is now in the routing tables. 
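    // ------------------------------------------------------------------------------------
    // Illustrative sketch only (not part of this patch): the swap lifecycle exercised by this
    // test boils down to a handful of HelixAdmin calls. The admin methods named below are the
    // ones used in this file; "admin", "cluster", "swapOut", "swapIn" and "swapInConfig" are
    // hypothetical placeholder names.
    //
    //   HelixAdmin admin = _gSetupTool.getClusterManagementTool();
    //   // 1. Register the swap-in node with the same logicalId and fault zone, marked SWAP_IN.
    //   admin.addInstance(cluster, swapInConfig);
    //   admin.setInstanceOperation(cluster, swapIn, InstanceConstants.InstanceOperation.SWAP_IN);
    //   // 2. Once the swap-in node carries every replica of the swap-out node (in non-top
    //   //    states), the swap can be finalized.
    //   if (admin.canCompleteSwap(cluster, swapOut)) {
    //     admin.completeSwapIfPossible(cluster, swapOut, false);
    //   }
    //   // 3. Afterwards the swap-out node is left disabled with InstanceOperation.UNKNOWN,
    //   //    as asserted below.
    // ------------------------------------------------------------------------------------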
validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is not active and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); // Check to make sure the throttle was enabled again after the swap was completed. Assert.assertTrue(instanceToSwapInInstanceConfigListener.isThrottlesEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -604,21 +592,18 @@ public void testNodeSwap() throws Exception { public void testNodeSwapDisableAndReenable() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwap() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed since setting the InstanceOperation to swap out Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -628,11 +613,9 @@ public void testNodeSwapDisableAndReenable() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. 
Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, @@ -642,12 +625,12 @@ public void testNodeSwapDisableAndReenable() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Disable the SWAP_IN instance + // Try to disable the swap out instance, it should not do anything. _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapInName, false); + .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); // Check that the SWAP_IN instance's replicas match the SWAP_OUT instance's replicas - // but all of them are OFFLINE + // and all of them are OFFLINE. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); Map> resourcePartitionStateOnSwapOutInstance = getResourcePartitionStateOnInstance(getEVs(), instanceToSwapOutName); @@ -658,14 +641,27 @@ public void testNodeSwapDisableAndReenable() throws Exception { .collect(Collectors.toSet()), resourcePartitionStateOnSwapOutInstance.values().stream().flatMap(p -> p.keySet().stream()) .collect(Collectors.toSet())); + Set swapOutInstancePartitionStates = + resourcePartitionStateOnSwapOutInstance.values().stream().flatMap(e -> e.values().stream()) + .collect(Collectors.toSet()); + Assert.assertEquals(swapOutInstancePartitionStates.size(), 1); + Assert.assertTrue(swapOutInstancePartitionStates.contains("OFFLINE")); Set swapInInstancePartitionStates = resourcePartitionStateOnSwapInInstance.values().stream().flatMap(e -> e.values().stream()) .collect(Collectors.toSet()); Assert.assertEquals(swapInInstancePartitionStates.size(), 1); Assert.assertTrue(swapInInstancePartitionStates.contains("OFFLINE")); - // Re-enable the SWAP_IN instance - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); + // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); + validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); + + // Assert canSwapBeCompleted is true + Assert.assertTrue(_gSetupTool.getClusterManagementTool() + .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); + + // Re-enable the swap out instance + _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapOutName, true); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. @@ -681,63 +677,61 @@ public void testNodeSwapDisableAndReenable() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is not active and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. 
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); } @Test(dependsOnMethods = "testNodeSwapDisableAndReenable") - public void testNodeSwapSwapInNodeNoInstanceOperationDisabled() throws Exception { - System.out.println( - "START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperationDisabled() at " + public void testNodeSwapSwapInNodeNoInstanceOperation() throws Exception { + System.out.println("START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperation() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Add instance with InstanceOperation unset, should automatically be set to SWAP_IN + // Add instance with InstanceOperation unset, should set to UNKNOWN. String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, -1); + // Validate that the SWAP_IN instance does not have any partitions on it. Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Enable the SWAP_IN instance, so it can start being assigned replicas - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); + // Set InstanceOperation to SWAP_IN + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. 
validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -750,37 +744,34 @@ public void testNodeSwapSwapInNodeNoInstanceOperationDisabled() throws Exception Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is inactive and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); } - @Test(dependsOnMethods = "testNodeSwapSwapInNodeNoInstanceOperationDisabled") + @Test(dependsOnMethods = "testNodeSwapSwapInNodeNoInstanceOperation") public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwapCancelSwapWhenReadyToComplete() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed since setting the InstanceOperation to swap out Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -790,17 +781,15 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. 
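    // ------------------------------------------------------------------------------------
    // Illustrative sketch only (not part of this patch): with HELIX_ENABLED folded into
    // InstanceOperation, an in-flight swap is cancelled by moving the SWAP_IN node to the
    // UNKNOWN operation instead of disabling it, roughly:
    //
    //   admin.setInstanceOperation(cluster, swapIn, InstanceConstants.InstanceOperation.UNKNOWN);
    //
    // "admin", "cluster" and "swapIn" are placeholder names; the concrete call appears a few
    // lines below in this test.
    // ------------------------------------------------------------------------------------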
validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -808,23 +797,29 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Cancel SWAP by disabling the SWAP_IN instance and remove SWAP_OUT InstanceOperation from SWAP_OUT instance. - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapInName, false); + // Cancel the swap by setting the InstanceOperation to UNKNOWN on the SWAP_IN instance. + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.UNKNOWN); + + // Validate there are no partitions on the SWAP_IN instance. + Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); + validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, + Collections.emptySet(), Collections.emptySet()); + // Stop the participant _participants.get(_participants.size() - 1).syncStop(); // Wait for cluster to converge. Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); // Validate there are no partitions on the SWAP_IN instance. Assert.assertEquals(getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapInName).size(), 0); - // Validate that the SWAP_OUT instance has the same partitions as it had before. + // Validate that the swap out instance has the same partitions as it had before. validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -833,7 +828,7 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance has the same partitions as it had before. + // Validate that the swap out instance has the same partitions as it had before. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet())), TIMEOUT); } @@ -843,7 +838,7 @@ public void testNodeSwapAfterEMM() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapAfterEMM() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); @@ -854,14 +849,11 @@ public void testNodeSwapAfterEMM() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, true, null, null); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed. 
Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -871,10 +863,10 @@ public void testNodeSwapAfterEMM() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Validate that the assignment has not changed since adding the SWAP_IN node. - // During MM, the cluster should not compute new assignment. + // During MM, the cluster should not compute new assignment on SWAP_IN node. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -884,14 +876,14 @@ public void testNodeSwapAfterEMM() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - // Validate that partitions on SWAP_OUT instance does not change after exiting MM - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Validate that partitions on swap out instance does not change after exiting MM + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -907,11 +899,11 @@ public void testNodeSwapAfterEMM() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is disabled and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. 
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -923,28 +915,25 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { "START TestInstanceOperation.testNodeSwapWithSwapOutInstanceDisabled() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); Set swapOutInstanceOriginalPartitions = getPartitionsAndStatesOnInstance(originalEVs, instanceToSwapOutName).keySet(); - // Disable the SWAP_OUT instance. + // Disable the swap out instance. _gSetupTool.getClusterManagementTool() .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance has all partitions in OFFLINE state + // Validate that the swap out instance has all partitions in OFFLINE state Set swapOutInstanceOfflineStates = new HashSet<>(getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapOutName).values()); Assert.assertEquals(swapOutInstanceOfflineStates.size(), 1); @@ -954,21 +943,16 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Validate that the SWAP_IN instance has the same partitions in secondTopState as the SWAP_OUT instance - // did before being disabled. + // Validate that the SWAP_IN instance has no partitions because the swap started when the swap out node was offline Map swapInInstancePartitionsAndStates = getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapInName); - Assert.assertTrue( - swapInInstancePartitionsAndStates.keySet().containsAll(swapOutInstanceOriginalPartitions)); - Set swapInInstanceStates = new HashSet<>(swapInInstancePartitionsAndStates.values()); - swapInInstanceStates.removeAll(SECONDARY_STATE_SET); - Assert.assertEquals(swapInInstanceStates.size(), 0); + Assert.assertEquals(swapInInstancePartitionsAndStates.size(), 0); - // Assert canSwapBeCompleted is false because SWAP_OUT instance is disabled. + // Assert canSwapBeCompleted is false because swap out instance is disabled. 
Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); @@ -976,9 +960,9 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .completeSwapIfPossible(CLUSTER_NAME, instanceToSwapOutName, false)); - Assert.assertTrue(_clusterVerifier.verifyByPolling()); + Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is disabled and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); @@ -993,28 +977,23 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { "START TestInstanceOperation.testNodeSwapWithSwapOutInstanceOffline() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - - Assert.assertTrue(_clusterVerifier.verifyByPolling()); // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Kill the participant _participants.get(0).syncStop(); @@ -1025,7 +1004,7 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); // Assert completeSwapIfPossible is true @@ -1037,11 +1016,11 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is inactive and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. 
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -1051,7 +1030,7 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { public void testSwapEvacuateAdd() throws Exception { System.out.println("START TestInstanceOperation.testSwapEvacuateAdd() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); @@ -1062,6 +1041,8 @@ public void testSwapEvacuateAdd() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, true, null, null); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + // Set instance's InstanceOperation to EVACUATE String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() @@ -1073,11 +1054,12 @@ public void testSwapEvacuateAdd() throws Exception { validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Add instance with InstanceOperation set to SWAP_IN + // Add instance with InstanceOperation set to ENABLE String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.ENABLE, -1); // Exit maintenance mode _gSetupTool.getClusterManagementTool() @@ -1085,7 +1067,7 @@ public void testSwapEvacuateAdd() throws Exception { Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had. + // Validate that the SWAP_IN instance has the same partitions the swap out instance had. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -1093,9 +1075,9 @@ public void testSwapEvacuateAdd() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .isEvacuateFinished(CLUSTER_NAME, instanceToSwapOutName)); - // Disable the EVACUATE instance - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); + // Set the EVACUATE instance to UNKNOWN + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, + InstanceConstants.InstanceOperation.UNKNOWN); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -1105,54 +1087,13 @@ public void testSwapEvacuateAdd() throws Exception { } @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testSwapEvacuateAdd") - public void testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet() throws Exception { - System.out.println( - "START TestInstanceOperation.testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet() at " - + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Get the SWAP_OUT instance. 
- String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - - // Add instance with InstanceOperation set to SWAP_IN enabled before setting SWAP_OUT instance. - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet") - public void testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet() throws Exception { + public void testUnsetInstanceOperationOnSwapInWhenSwapping() throws Exception { System.out.println( - "START TestInstanceOperation.testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet() at " + "START TestInstanceOperation.testUnsetInstanceOperationOnSwapInWhenSwapping() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Get the SWAP_OUT instance. - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); - Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - - // Enable the SWAP_IN instance before we have set the SWAP_OUT instance. - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet") - public void testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut() throws Exception { - System.out.println( - "START TestInstanceOperation.testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut() at " - + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Get the SWAP_OUT instance. String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); @@ -1160,26 +1101,27 @@ public void testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut() throws // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.SWAP_IN, -1); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Try to remove the InstanceOperation from the SWAP_IN instance before the SWAP_OUT instance is set. + // Try to remove the InstanceOperation from the SWAP_IN instance before swap in instance is set to unknown. 
// This should throw exception because we cannot ever have two instances with the same logicalId and both have InstanceOperation // unset. _gSetupTool.getClusterManagementTool() .setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, null); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut") + @Test(dependsOnMethods = "testUnsetInstanceOperationOnSwapInWhenSwapping") public void testNodeSwapAddSwapInFirst() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapAddSwapInFirst() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); - // Get the SWAP_OUT instance. + // Get the swap out instance. String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); @@ -1187,7 +1129,8 @@ public void testNodeSwapAddSwapInFirst() throws Exception { // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.SWAP_IN, -1); } @Test(dependsOnMethods = "testNodeSwapAddSwapInFirst") @@ -1195,7 +1138,8 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { System.out.println( "START TestInstanceOperation.testEvacuateAndCancelBeforeBootstrapFinish() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + + removeOfflineOrInactiveInstances(); // add a resource where downward state transition is slow createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", @@ -1346,7 +1290,31 @@ public void testMarkEvacuationAfterEMM() throws Exception { _stateModelDelay = 3L; } - @Test(dependsOnMethods = "testMarkEvacuationAfterEMM") + @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testMarkEvacuationAfterEMM") + public void testSwapEvacuateAddRemoveEvacuate() throws Exception { + System.out.println("START TestInstanceOperation.testSwapEvacuateAddRemoveEvacuate() at " + new Date( + System.currentTimeMillis())); + removeOfflineOrInactiveInstances(); + + // Set instance's InstanceOperation to EVACUATE + String instanceToSwapOutName = _participants.get(0).getInstanceName(); + InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, + InstanceConstants.InstanceOperation.EVACUATE); + + // Add instance with InstanceOperation set to ENABLE + String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; + addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.ENABLE, -1); + + // Remove EVACUATE instance's InstanceOperation + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + } + + @Test(dependsOnMethods = 
"testSwapEvacuateAddRemoveEvacuate") public void testEvacuationWithOfflineInstancesInCluster() throws Exception { System.out.println( "START TestInstanceOperation.testEvacuationWithOfflineInstancesInCluster() at " + new Date( @@ -1358,11 +1326,10 @@ public void testEvacuationWithOfflineInstancesInCluster() throws Exception { _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, evacuateInstanceName, InstanceConstants.InstanceOperation.EVACUATE); - Map assignment; // EV should contain all participants, check resources one by one - assignment = getEVs(); - for (String resource : _allDBs) { - verifier(() -> { + verifier(() -> { + Map assignment = getEVs(); + for (String resource : _allDBs) { ExternalView ev = assignment.get(resource); for (String partition : ev.getPartitionSet()) { AtomicInteger activeReplicaCount = new AtomicInteger(); @@ -1372,44 +1339,21 @@ public void testEvacuationWithOfflineInstancesInCluster() throws Exception { .forEach(v -> activeReplicaCount.getAndIncrement()); if (activeReplicaCount.get() < REPLICA - 1 || ( ev.getStateMap(partition).containsKey(evacuateInstanceName) && ev.getStateMap( - partition).get(evacuateInstanceName).equals("MASTER") && ev.getStateMap(partition) - .get(evacuateInstanceName).equals("LEADER"))) { + partition).get(evacuateInstanceName).equals("MASTER") && ev.getStateMap( + partition).get(evacuateInstanceName).equals("LEADER"))) { return false; } } - return true; - }, 30000); - } + } + return true; + }, 30000); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); addParticipant(PARTICIPANT_PREFIX + "_" + _nextStartPort); addParticipant(PARTICIPANT_PREFIX + "_" + _nextStartPort); dropTestDBs(ImmutableSet.of("TEST_DB3_DELAYED_CRUSHED", "TEST_DB4_DELAYED_WAGED")); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testEvacuationWithOfflineInstancesInCluster") - public void testSwapEvacuateAddRemoveEvacuate() throws Exception { - System.out.println("START TestInstanceOperation.testSwapEvacuateAddRemoveEvacuate() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to EVACUATE - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.EVACUATE); - - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); - - // Remove EVACUATE instance's InstanceOperation - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); - } - /** * Verifies that the given verifier returns true within the given timeout. Handles AssertionError * by returning false, which TestHelper.verify will not do. 
Asserts that return value from @@ -1449,9 +1393,9 @@ public boolean isThrottlesEnabled() { public void onInstanceConfigChange(List instanceConfig, NotificationContext context) { if (instanceConfig.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { throttlesEnabled = false; - } else if (instanceConfig.get(0).getInstanceOperation().isEmpty()) { + } else { throttlesEnabled = true; } } @@ -1470,21 +1414,21 @@ private MockParticipantManager createParticipant(String participantName) throws private void addParticipant(String participantName) throws Exception { addParticipant(participantName, UUID.randomUUID().toString(), - "zone_" + _participants.size() % ZONE_COUNT, null, true, -1); + "zone_" + _participants.size() % ZONE_COUNT, null, -1); } private void addParticipant(String participantName, String logicalId, String zone, - InstanceConstants.InstanceOperation instanceOperation, boolean enabled, int capacity) + InstanceConstants.InstanceOperation instanceOperation, int capacity) throws Exception { - addParticipant(participantName, logicalId, zone, instanceOperation, enabled, capacity, null); + addParticipant(participantName, logicalId, zone, instanceOperation, capacity, null); } private void addParticipant(String participantName, String logicalId, String zone, - InstanceConstants.InstanceOperation instanceOperation, boolean enabled, int capacity, + InstanceConstants.InstanceOperation instanceOperation, int capacity, InstanceConfigChangeListener listener) throws Exception { InstanceConfig config = new InstanceConfig.Builder().setDomain( String.format("%s=%s, %s=%s, %s=%s", ZONE, zone, HOST, participantName, LOGICAL_ID, - logicalId)).setInstanceEnabled(enabled).setInstanceOperation(instanceOperation) + logicalId)).setInstanceOperation(instanceOperation) .build(participantName); if (capacity >= 0) { diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java index 697847c29c..9cc4eea52f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.integration.rebalancer.PartitionMigration.TestPartitionMigrationBase; import org.apache.helix.model.ClusterConfig; @@ -103,7 +104,7 @@ public void testClusterExpansionByEnableInstance() for (int i = numNodes; i < numNodes + NUM_NODE; i++) { String storageNodeName = PARTICIPANT_PREFIX + "_" + (START_PORT + i); InstanceConfig config = InstanceConfig.toInstanceConfig(storageNodeName); - config.setInstanceEnabled(false); + config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); config.getRecord().getSimpleFields() .remove(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name()); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java index f6ef8279dc..fbb7304509 100644 --- 
a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java @@ -32,6 +32,7 @@ import org.apache.helix.ConfigAccessor; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.model.BuiltInStateModelDefinitions; @@ -156,7 +157,7 @@ public void testNodeSwap() throws Exception { String oldParticipantName = oldParticipant.getInstanceName(); final InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, oldParticipantName); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, oldParticipantName, instanceConfig); Assert.assertTrue(_clusterVerifier.verify(10000)); @@ -231,7 +232,7 @@ public void testFaultZoneSwap() throws Exception { InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceName); if (instanceConfig.getDomainAsMap().get("zone").equals(randZoneStr)) { - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, instanceName, instanceConfig); removedInstanceConfigMap.put(instanceName, instanceConfig); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java index a7250d4804..26eb13d7a6 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java @@ -34,6 +34,7 @@ import org.apache.helix.HelixException; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import org.apache.helix.controller.rebalancer.util.RebalanceScheduler; @@ -387,7 +388,7 @@ public void testDisableInstance() throws InterruptedException { disableParticipants.add(p.getInstanceName()); InstanceConfig config = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, p.getInstanceName()); - config.setInstanceEnabled(false); + config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, p.getInstanceName(), config); } @@ -408,7 +409,7 @@ public void testDisableInstance() throws InterruptedException { for (String instanceName : disableParticipants) { InstanceConfig config = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceName); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, instanceName, config); } diff --git 
a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java index 40d5c9774d..77f4432352 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java @@ -82,6 +82,7 @@ private void preSetup() throws Exception { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); _fullPipeline = new Pipeline("FullPipeline"); @@ -124,6 +125,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { CurrentStateOutput currentStateOutput = populateCurrentStateFromBestPossible(_bestpossibleState); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -161,6 +163,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setPendingRelayMessage(_db, _partition, initialMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -179,6 +182,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setPendingRelayMessage(_db, _partition, initialMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _messagePipeline.handle(event); @@ -218,6 +222,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), _bestpossibleState); _messagePipeline.handle(event); @@ -244,6 +249,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setCurrentState(_db, _partition, initialMaster, "SLAVE"); currentStateOutput.setPendingMessage(_db, _partition, secondMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -264,6 +270,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { // Validate: controller should not send S->M to thirdMaster. currentStateOutput.setCurrentState(_db, _partition, initialMaster, "OFFLINE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); thirdMaster = getTopStateInstance(_bestpossibleState.getInstanceStateMap(_db, _partition), @@ -290,6 +297,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { // Validate: Controller should not send S->M to thirdMaster. 
currentStateOutput.setPendingMessage(_db, _partition, secondMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), _bestpossibleState); @@ -310,6 +318,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setCurrentState(_db, _partition, thirdMaster, "SLAVE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _messagePipeline.handle(event); diff --git a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java index 307022f3fb..9a38656953 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java @@ -95,6 +95,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); Pipeline pipeline = createPipeline(); @@ -106,6 +107,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { CurrentStateOutput currentStateOutput = populateCurrentStateFromBestPossible(bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); Partition p = new Partition(db + "_0"); @@ -153,6 +155,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setPendingRelayMessage(db, p, masterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline.handle(event); @@ -167,6 +170,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setCurrentState(db, p, masterInstance, "SLAVE"); currentStateOutput.setPendingMessage(db, p, newMasterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline.handle(event); @@ -186,6 +190,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { // but controller should not send S->M to newly calculated master. currentStateOutput.setCurrentState(db, p, masterInstance, "OFFLINE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); String slaveInstance = getTopStateInstance(bestPossibleStateOutput.getInstanceStateMap(db, p), @@ -217,6 +222,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { // Controller will not send S->M to new master. 
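    // ------------------------------------------------------------------------------------
    // Illustrative sketch only (not part of this patch): these pipeline tests now populate the
    // new CURRENT_STATE_EXCLUDING_UNKNOWN event attribute alongside CURRENT_STATE (presumably
    // the current state with UNKNOWN-operation instances filtered out). Every hunk repeats the
    // same pattern:
    //
    //   event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput);
    //   event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput);
    // ------------------------------------------------------------------------------------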
currentStateOutput.setPendingMessage(db, p, newMasterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.INTERMEDIATE_STATE.name(), bestPossibleStateOutput); @@ -244,6 +250,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setCurrentState(db, p, slaveInstance, "SLAVE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline = new Pipeline("test"); pipeline.addStage(new MessageGenerationPhase()); @@ -271,6 +278,7 @@ private void testP2PMessage(ClusterConfig clusterConfig, Boolean p2pMessageEnabl event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -284,6 +292,7 @@ private void testP2PMessage(ClusterConfig clusterConfig, Boolean p2pMessageEnabl CurrentStateOutput currentStateOutput = populateCurrentStateFromBestPossible(bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); Partition p = new Partition(db + "_0"); diff --git a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java index ea2a4aa38d..9f60a4a7fb 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java @@ -165,6 +165,7 @@ private ClusterEvent generateClusterEvent() { currentStateOutput.setCurrentState(RESOURCE_NAME, new Partition("0"), "localhost_2", "SLAVE"); currentStateOutput.setCurrentState(RESOURCE_NAME, new Partition("1"), "localhost_2", "MASTER"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); BestPossibleStateOutput bestPossibleStateOutput = new BestPossibleStateOutput(); bestPossibleStateOutput.setState(RESOURCE_NAME, new Partition("0"), "localhost_1", "SLAVE"); diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index 60ee9b6257..9a1311b1c3 100644 --- a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -266,11 +266,13 @@ public void removeFromIdealState(String clusterName, String resourceName, IdealS } + @Deprecated @Override public void enableInstance(String clusterName, String instanceName, boolean enabled) { enableInstance(clusterName, instanceName, enabled, null, null); } + 
@Deprecated @Override public void enableInstance(String clusterName, String instanceName, boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -283,7 +285,8 @@ public void enableInstance(String clusterName, String instanceName, boolean enab ZNRecord record = (ZNRecord) _baseDataAccessor.get(instanceConfigPath, null, 0); InstanceConfig instanceConfig = new InstanceConfig(record); - instanceConfig.setInstanceEnabled(enabled); + instanceConfig.setInstanceOperation(enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); if (!enabled) { instanceConfig.resetInstanceDisabledTypeAndReason(); if (reason != null) { @@ -296,6 +299,7 @@ public void enableInstance(String clusterName, String instanceName, boolean enab _baseDataAccessor.set(instanceConfigPath, instanceConfig.getRecord(), 0); } + @Deprecated @Override public void enableInstance(String clusterName, List instances, boolean enabled) { diff --git a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java index b8e6569f5d..7da983b8aa 100644 --- a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java +++ b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java @@ -51,7 +51,7 @@ public void testGetParsedDomain() { @Test public void testSetInstanceEnableWithReason() { InstanceConfig instanceConfig = new InstanceConfig(new ZNRecord("id")); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); instanceConfig.setInstanceDisabledReason("NoShowReason"); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); @@ -63,7 +63,7 @@ public void testSetInstanceEnableWithReason() { .get(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_TYPE.toString()), null); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); String reasonCode = "ReasonCode"; instanceConfig.setInstanceDisabledReason(reasonCode); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); @@ -197,4 +197,45 @@ public void testInstanceConfigBuilder() { Assert.assertEquals(instanceConfig.getInstanceInfoMap().get("CABINET"), "30"); Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight1"), Integer.valueOf(1)); } + + @Test + public void testOverwriteInstanceConfig() { + InstanceConfig instanceConfig = new InstanceConfig("instance2"); + instanceConfig.setHostName("host1"); + instanceConfig.setPort("1234"); + instanceConfig.setDomain("foo=bar"); + instanceConfig.setWeight(100); + instanceConfig.setInstanceEnabled(false); + instanceConfig.addTag("tag1"); + instanceConfig.addTag("tag2"); + instanceConfig.setInstanceCapacityMap(ImmutableMap.of("weight1", 1)); + + InstanceConfig overrideConfig = new InstanceConfig("instance1"); + overrideConfig.setHostName("host2"); + overrideConfig.setPort("5678"); + overrideConfig.setDomain("foo=bar2"); + overrideConfig.setWeight(200); + overrideConfig.addTag("tag3"); + overrideConfig.addTag("tag4"); + overrideConfig.setInstanceOperation(InstanceConstants.InstanceOperation.EVACUATE); + overrideConfig.setInstanceCapacityMap(ImmutableMap.of("weight2", 2)); + + instanceConfig.overwriteInstanceConfig(overrideConfig); + + Assert.assertEquals(instanceConfig.getId(), "instance2"); + Assert.assertEquals(instanceConfig.getHostName(), 
"host1"); + Assert.assertEquals(instanceConfig.getPort(), "1234"); + Assert.assertEquals(instanceConfig.getDomainAsString(), "foo=bar"); + Assert.assertEquals(instanceConfig.getWeight(), 200); + Assert.assertFalse(instanceConfig.getTags().contains("tag1")); + Assert.assertFalse(instanceConfig.getTags().contains("tag2")); + Assert.assertTrue(instanceConfig.getTags().contains("tag3")); + Assert.assertTrue(instanceConfig.getTags().contains("tag4")); + Assert.assertFalse(instanceConfig.getRecord().getSimpleFields() + .containsKey(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED.toString())); + Assert.assertEquals(instanceConfig.getInstanceOperation(), + InstanceConstants.InstanceOperation.EVACUATE); + Assert.assertFalse(instanceConfig.getInstanceCapacityMap().containsKey("weight1")); + Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight2"), Integer.valueOf(2)); + } } diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java index 57a3ad0bc8..b02a0f41d4 100644 --- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java +++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java @@ -69,6 +69,7 @@ public void testRecoveryRebalanceMetrics() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ClusterStatusMonitor monitor = new ClusterStatusMonitor(_clusterName); monitor.active(); @@ -119,6 +120,7 @@ public void testLoadBalanceMetrics() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ClusterStatusMonitor monitor = new ClusterStatusMonitor(_clusterName); monitor.active(); @@ -131,6 +133,7 @@ public void testLoadBalanceMetrics() { event.getAttribute(AttributeName.BEST_POSSIBLE_STATE.name()); currentStateOutput = copyCurrentStateFromBestPossible(bestPossibleStateOutput, resource); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); setupLiveInstances(4); diff --git a/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java b/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java index d8810b153d..24113c9bed 100644 --- a/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java +++ b/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java @@ -86,7 +86,7 @@ public void testTwoRunningCurrentStates() { when(mock._cache.getTaskDataCache()).thenReturn(mock._taskDataCache); when(mock._cache.getJobContext(JOB_NAME)).thenReturn(mock._jobContext); when(mock._cache.getIdealStates()).thenReturn(mock._idealStates); - 
when(mock._cache.getAssignableEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); + when(mock._cache.getEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); when(mock._cache.getAssignableInstanceConfigMap()).thenReturn(_instanceConfigs); when(mock._cache.getClusterConfig()).thenReturn(_clusterConfig); when(mock._taskDataCache.getRuntimeJobDag(WORKFLOW_NAME)).thenReturn(mock._runtimeJobDag); @@ -123,7 +123,7 @@ public void testOneRunningOneNull() { when(mock._cache.getTaskDataCache()).thenReturn(mock._taskDataCache); when(mock._cache.getJobContext(JOB_NAME)).thenReturn(mock._jobContext); when(mock._cache.getIdealStates()).thenReturn(mock._idealStates); - when(mock._cache.getAssignableEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); + when(mock._cache.getEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); when(mock._cache.getAssignableInstanceConfigMap()).thenReturn(_instanceConfigs); when(mock._cache.getClusterConfig()).thenReturn(_clusterConfig); when(mock._taskDataCache.getRuntimeJobDag(WORKFLOW_NAME)).thenReturn(mock._runtimeJobDag); diff --git a/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java b/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java index 6a7c8ba532..7d2a1b36fb 100644 --- a/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java +++ b/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; @@ -43,7 +44,8 @@ public void testIdealStateAssignment(String clusterName, List instances, for (String instance : instances) { instanceConfigs.add(new InstanceConfig(instance)); if (disabledInstances.contains(instance)) { - instanceConfigs.get(instanceConfigs.size() - 1).setInstanceEnabled(false); + instanceConfigs.get(instanceConfigs.size() - 1) + .setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); } } diff --git a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java index 79b0fdce81..88dd053514 100644 --- a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java +++ b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java @@ -33,6 +33,7 @@ import org.apache.helix.HelixException; import org.apache.helix.PropertyKey; import org.apache.helix.PropertyType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.CurrentState; import org.apache.helix.model.ExternalView; @@ -77,7 +78,9 @@ public void TestIsInstanceEnabled(boolean instanceConfigEnabled, boolean cluster boolean expected) { Mock mock = new Mock(); InstanceConfig instanceConfig = new InstanceConfig(TEST_INSTANCE); - instanceConfig.setInstanceEnabled(instanceConfigEnabled); + instanceConfig.setInstanceOperation( + instanceConfigEnabled ? 
InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); doReturn(instanceConfig).when(mock.dataAccessor) .getProperty(BUILDER.instanceConfig(TEST_INSTANCE)); ClusterConfig clusterConfig = new ClusterConfig(TEST_CLUSTER); @@ -101,17 +104,6 @@ public void TestIsInstanceEnabled_whenInstanceConfigNull() { InstanceValidationUtil.isEnabled(mock.dataAccessor, TEST_INSTANCE); } - @Test(expectedExceptions = HelixException.class) - public void TestIsInstanceEnabled_whenClusterConfigNull() { - Mock mock = new Mock(); - doReturn(new InstanceConfig(TEST_INSTANCE)).when(mock.dataAccessor) - .getProperty(argThat(new PropertyKeyArgument(PropertyType.CONFIGS))); - doReturn(null).when(mock.dataAccessor) - .getProperty(BUILDER.clusterConfig()); - - InstanceValidationUtil.isEnabled(mock.dataAccessor, TEST_INSTANCE); - } - @Test public void TestIsInstanceAlive() { Mock mock = new Mock(); diff --git a/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json b/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json index 365994257d..30553e0fda 100644 --- a/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json +++ b/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json @@ -94,7 +94,8 @@ "node_3" ], "expectedBestPossibleStateMap": { - "node_1": "OFFLINE" + "node_1": "OFFLINE", + "node_3": "ERROR" } }, { diff --git a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java index 877aaa9c89..8a4bbf07bd 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java @@ -268,7 +268,7 @@ private void collectEvacuatingInstances(Set toBeStoppedInstances) { PropertyKey.Builder propertyKeyBuilder = _dataAccessor.keyBuilder(); InstanceConfig instanceConfig = _dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instance)); - if (InstanceConstants.InstanceOperation.EVACUATE.name() + if (InstanceConstants.InstanceOperation.EVACUATE .equals(instanceConfig.getInstanceOperation())) { toBeStoppedInstances.add(instance); } diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java index 03465a9cd8..714b53f450 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java @@ -44,6 +44,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.helix.ConfigAccessor; import org.apache.helix.HelixDataAccessor; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.RebalanceStrategy; import org.apache.helix.controller.rebalancer.waged.WagedRebalancer; @@ -225,7 +226,8 @@ private ClusterState readClusterStateAndValidateInput(String clusterId, InputFie // Throw exception if there is no instanceConfig for activatedInstances instance. 
for (String instance : inputFields.activatedInstances) { if (instanceConfigMap.containsKey(instance)) { - instanceConfigMap.get(instance).setInstanceEnabled(true); + instanceConfigMap.get(instance) + .setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); } else { throw new InvalidParameterException( "instance: " + instance + "does not have instanceConfig"); @@ -234,7 +236,8 @@ private ClusterState readClusterStateAndValidateInput(String clusterId, InputFie for (String instance : inputFields.deactivatedInstances) { if (instanceConfigMap.containsKey(instance)) { - instanceConfigMap.get(instance).setInstanceEnabled(false); + instanceConfigMap.get(instance) + .setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); } } diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java b/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java index d0f0c57151..c6ff0d6b02 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java @@ -45,6 +45,7 @@ import org.apache.helix.PropertyPathBuilder; import org.apache.helix.PropertyType; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.integration.task.MockTask; @@ -574,7 +575,7 @@ private void preSetupForParallelInstancesStoppableTest(String clusterName, instanceConfigs.add(new InstanceConfig(instances.get(instances.size() - 1))); instanceConfigs.get(instanceConfigs.size() - 1).setDomain("helixZoneId=zone2,host=instance5"); - instanceConfigs.get(1).setInstanceEnabled(false); + instanceConfigs.get(1).setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); instanceConfigs.get(3).setInstanceEnabledForPartition("FakeResource", "FakePartition", false); for (InstanceConfig instanceConfig : instanceConfigs) { diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java index 93722f05af..0403083fb7 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java @@ -359,7 +359,7 @@ public void testInstancesStoppable_disableOneInstance() throws IOException { // Disable one selected instance0, it should failed to check String instance = "instance0"; InstanceConfig instanceConfig = _configAccessor.getInstanceConfig(STOPPABLE_CLUSTER, instance); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); instanceConfig.setInstanceEnabledForPartition("FakeResource", "FakePartition", false); _configAccessor.setInstanceConfig(STOPPABLE_CLUSTER, instance, instanceConfig); @@ -377,7 +377,7 @@ public void testInstancesStoppable_disableOneInstance() throws IOException { ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE","HELIX:MIN_ACTIVE_REPLICA_CHECK_FAILED")); // Reenable instance0, it should passed the check - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); instanceConfig.setInstanceEnabledForPartition("FakeResource", "FakePartition", true); 
_configAccessor.setInstanceConfig(STOPPABLE_CLUSTER, instance, instanceConfig); Assert.assertTrue(verifier.verifyByPolling()); diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java index 2c7a46b094..e00c392b0f 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java @@ -36,6 +36,7 @@ import org.apache.helix.ConfigAccessor; import org.apache.helix.HelixDataAccessor; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.waged.WagedRebalancer; @@ -107,7 +108,7 @@ public void beforeTest() { for (int i = 0; i < DEFAULT_INSTANCE_COUNT; i++) { String instanceName = INSTANCE_NAME_PREFIX + (INSTANCE_START_PORT + i); InstanceConfig instanceConfig = new InstanceConfig(instanceName); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); instanceConfig.setInstanceCapacityMap( Collections.singletonMap(INSTANCE_CAPACITY_KEY, DEFAULT_INSTANCE_CAPACITY)); _gSetupTool.getClusterManagementTool().addInstance(CLUSTER_NAME, instanceConfig); diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java index f9e48ca1f5..943444cad1 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java @@ -36,7 +36,6 @@ import com.fasterxml.jackson.databind.node.ArrayNode; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; -import org.apache.helix.ConfigAccessor; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; import org.apache.helix.TestHelper; @@ -50,9 +49,6 @@ import org.apache.helix.rest.server.resources.helix.InstancesAccessor; import org.apache.helix.rest.server.resources.helix.PerInstanceAccessor; import org.apache.helix.rest.server.util.JerseyUriRequestBuilder; -import org.apache.helix.tools.ClusterStateVerifier; -import org.apache.helix.tools.ClusterVerifiers.StrictMatchExternalViewVerifier; -import org.apache.helix.tools.ClusterVerifiers.ZkHelixClusterVerifier; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.testng.Assert; import org.testng.annotations.Test; @@ -500,15 +496,15 @@ public void updateInstance() throws IOException { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString()); + Assert.assertEquals(instanceConfig.getInstanceOperation(), + InstanceConstants.InstanceOperation.EVACUATE); new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=INVALIDOP") .expectedReturnStatusCode(Response.Status.NOT_FOUND.getStatusCode()).format(CLUSTER_NAME, 
INSTANCE_NAME).post(this, entity);
 new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=")
 .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity);
 instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME);
- Assert.assertEquals(
- instanceConfig.getInstanceOperation(), "");
+ Assert.assertEquals(instanceConfig.getInstanceOperation(),
+ InstanceConstants.InstanceOperation.ENABLE);
 // test canCompleteSwap
 Response canCompleteSwapResponse =
@@ -548,8 +544,8 @@ public void updateInstance() throws IOException {
 new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE")
 .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity);
 instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME);
- Assert.assertEquals(
- instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString());
+ Assert.assertEquals(instanceConfig.getInstanceOperation(),
+ InstanceConstants.InstanceOperation.EVACUATE);
 Response response =
 new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished")
 .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity);
@@ -591,8 +587,8 @@ public void updateInstance() throws IOException {
 new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE")
 .format(CLUSTER_NAME, test_instance_name).post(this, entity);
 instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, test_instance_name);
- Assert.assertEquals(
- instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString());
+ Assert.assertEquals(instanceConfig.getInstanceOperation(),
+ InstanceConstants.InstanceOperation.EVACUATE);
 response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished")
 .format(CLUSTER_NAME, test_instance_name).post(this, entity);
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
index 899256619f..7d49318f02 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
@@ -34,6 +34,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
 import org.apache.helix.HelixDataAccessor;
 import org.apache.helix.TestHelper;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.model.IdealState;
 import org.apache.helix.model.InstanceConfig;
@@ -68,7 +69,7 @@ public void beforeClass() {
 toEnabledInstance = liveInstances.get(2);
 InstanceConfig config = _gSetupTool.getClusterManagementTool()
 .getInstanceConfig(cluster, toEnabledInstance);
- config.setInstanceEnabled(false);
+ config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
 _gSetupTool.getClusterManagementTool()
 .setInstanceConfig(cluster, toEnabledInstance, config);
@@ -94,7 +95,7 @@ public void afterClass() {
 }
 InstanceConfig config = _gSetupTool.getClusterManagementTool()
 .getInstanceConfig(cluster, toEnabledInstance);
- config.setInstanceEnabled(true);
+ config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
_gSetupTool.getClusterManagementTool() .enableMaintenanceMode(cluster, false, TestHelper.getTestMethodName()); @@ -245,8 +246,8 @@ public void testComputePartitionAssignmentWaged() throws IOException { InstanceConfig toEnabledInstanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(cluster, toEnabledInstance); // Another way to mark the node as inactive or active. - toDeactivatedInstanceConfig.setInstanceEnabled(false); - toEnabledInstanceConfig.setInstanceEnabled(true); + toDeactivatedInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); + toEnabledInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); // Write the current InstanceConfigs record to json string StringWriter sw = new StringWriter(); OBJECT_MAPPER.writeValue(sw, toDeactivatedInstanceConfig.getRecord()); diff --git a/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java b/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java index 998f1175a0..eddd68b387 100644 --- a/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java +++ b/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java @@ -24,6 +24,7 @@ import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; import org.apache.helix.zookeeper.impl.client.ZkClient; @@ -98,7 +99,7 @@ public static void main(String[] args) throws Exception { if (!nodes.contains("consumer_" + consumerId)) { InstanceConfig config = new InstanceConfig("consumer_" + consumerId); config.setHostName("localhost"); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } diff --git a/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java b/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java index 9dbcbfe048..5b8e736a97 100644 --- a/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java +++ b/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java @@ -19,6 +19,7 @@ * under the License. 
*/ +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; import org.apache.helix.zookeeper.impl.client.ZkClient; @@ -65,7 +66,7 @@ public static void main(String[] args) { InstanceConfig config = new InstanceConfig(serverId); config.setHostName("localhost"); config.setPort(port); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } // add resource "repository" which has 1 partition diff --git a/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java b/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java index 11d43953b4..3704c5406d 100644 --- a/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java +++ b/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java @@ -24,6 +24,7 @@ import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; import org.apache.helix.zookeeper.impl.client.ZkClient; @@ -98,7 +99,7 @@ public void run() { if (!nodes.contains(_instanceName)) { InstanceConfig config = new InstanceConfig(_instanceName); config.setHostName("localhost"); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(_clusterName, config); } From 1bfed3138a93c7e4a4bcc48349ce11b249167926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Grant=20Pal=C3=A1u=20Spencer?= Date: Tue, 7 May 2024 11:57:28 -0700 Subject: [PATCH 03/11] Prevent MetaClient LeaderElectionClient isLeader NPE before joining pool (#2798) Prevent MetaClient LeaderElectionClient isLeader NPE before joining pool --- .../leaderelection/LeaderElectionClient.java | 3 ++- .../recipes/leaderelection/TestLeaderElection.java | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java index 3bcf09ceb3..373d360132 100644 --- a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java +++ b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java @@ -121,7 +121,8 @@ public LeaderElectionClient(MetaClientInterface metaClient, String p * Returns true if current participant is the current leadership. 
*/
 public boolean isLeader(String leaderPath) {
- return getLeader(leaderPath).equalsIgnoreCase(_participant);
+ String leader = getLeader(leaderPath);
+ return leader != null && leader.equalsIgnoreCase(_participant);
 }
 /**
diff --git a/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java b/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
index 75917623c7..248643652a 100644
--- a/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
+++ b/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
@@ -41,7 +41,21 @@ public void cleanUp() {
 }
 }
+ // Test that calling isLeader before client joins LeaderElectionParticipantPool returns false and does not throw NPE
 @Test
+ public void testIsLeaderBeforeJoiningParticipantPool() throws Exception {
+ String leaderPath = LEADER_PATH + "/testIsLeaderBeforeJoiningPool";
+ LeaderElectionClient clt1 = createLeaderElectionClient(PARTICIPANT_NAME1);
+ try {
+ boolean isLeader = clt1.isLeader(leaderPath);
+ Assert.assertFalse(isLeader, "Expected isLeader to return false before joining participant pool");
+ } catch (NullPointerException npe) {
+ Assert.fail("isLeader threw NPE before joining participant pool: " + npe.getMessage());
+ }
+ clt1.close();
+ }
+
+ @Test (dependsOnMethods = "testIsLeaderBeforeJoiningParticipantPool")
 public void testAcquireLeadership() throws Exception {
 System.out.println("START TestLeaderElection.testAcquireLeadership");
 String leaderPath = LEADER_PATH + "/testAcquireLeadership";

From 1d47d6b51ee27764008685cf6ec76d4108915807 Mon Sep 17 00:00:00 2001
From: Charanya Sudharsanan
Date: Tue, 7 May 2024 21:44:40 -0700
Subject: [PATCH 04/11] [apache/helix] -- Add SetPartitionToError for participants to self annotate a node to ERROR state (#2792)

Co-authored-by: Charanya Sudharsanan

What: An API endpoint that validates the incoming request and sends a state transition message to set one or more partitions from any current state to ERROR state.

Why: Currently, participants are unable to set a partition to an ERROR state explicitly when it appears to be stuck in a specific current state. The only way a replica can be set to ERROR is from within a state model. With such an endpoint, clients can then call the resetPartition endpoint to set the replica back to INIT state and recover it; resetPartition works only on partitions in the ERROR state.
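Usage sketch (illustrative only, not part of the diff below): assuming the ZKHelixAdmin(String zkAddress) constructor and made-up cluster, instance, resource and partition names, a client could push stuck replicas to ERROR and then recover them with resetPartition:

    import java.util.Arrays;

    import org.apache.helix.HelixAdmin;
    import org.apache.helix.manager.zk.ZKHelixAdmin;

    public class SetPartitionsToErrorExample {
      public static void main(String[] args) {
        // The ZK address and all cluster/instance/resource/partition names below are placeholders.
        HelixAdmin admin = new ZKHelixAdmin("localhost:2181");
        // Move two replicas hosted on localhost_12918 from whatever state they are in to ERROR.
        admin.setPartitionsToError("MyCluster", "localhost_12918", "TestDB0",
            Arrays.asList("TestDB0_4", "TestDB0_7"));
        // resetPartition accepts only replicas already in ERROR, so the same replicas can now be
        // recovered back to the state model's initial state.
        admin.resetPartition("MyCluster", "localhost_12918", "TestDB0",
            Arrays.asList("TestDB0_4", "TestDB0_7"));
        admin.close();
      }
    }

The same operation is also exposed through the ClusterSetup CLI (--setPartitionsToError) and the per-instance REST accessor in the changes below.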
--- .../java/org/apache/helix/HelixAdmin.java | 12 + .../apache/helix/manager/zk/ZKHelixAdmin.java | 256 ++++++++++-------- .../handling/HelixStateTransitionHandler.java | 3 +- .../helix/messaging/handling/HelixTask.java | 1 + .../participant/statemachine/StateModel.java | 11 + .../org/apache/helix/tools/ClusterSetup.java | 21 ++ .../TestSetPartitionsToErrorState.java | 99 +++++++ .../helix/manager/zk/TestZkHelixAdmin.java | 119 +++++++- .../org/apache/helix/mock/MockHelixAdmin.java | 6 + .../server/resources/AbstractResource.java | 3 +- .../resources/helix/PerInstanceAccessor.java | 10 + .../rest/server/TestPerInstanceAccessor.java | 43 ++- 12 files changed, 458 insertions(+), 126 deletions(-) create mode 100644 helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java index d2e0c2681a..84a7154b18 100644 --- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java @@ -421,6 +421,18 @@ void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String r */ ClusterManagementMode getClusterManagementMode(String clusterName); + /** + * Set a list of partitions for an instance to ERROR state from any state. + * The partitions could be in any state and setPartitionsToError will bring them to ERROR + * state. ANY to ERROR state transition is required for this. + * @param clusterName + * @param instanceName + * @param resourceName + * @param partitionNames + */ + void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List partitionNames); + /** * Reset a list of partitions in error state for an instance * The partitions are assume to be in error state and reset will bring them from error diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index c7fe0861ba..8c873b4cdb 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -1035,6 +1035,136 @@ public ClusterManagementMode getClusterManagementMode(String clusterName) { : new ClusterManagementMode(status.getManagementMode(), status.getManagementModeStatus()); } + @Override + public void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List partitionNames) { + logger.info("Set partitions {} for resource {} on instance {} in cluster {} to ERROR state.", + partitionNames == null ? 
"NULL" : HelixUtil.serializeByComma(partitionNames), resourceName, + instanceName, clusterName); + sendStateTransitionMessage(clusterName, instanceName, resourceName, partitionNames, + StateTransitionType.SET_TO_ERROR); + } + + private void sendStateTransitionMessage(String clusterName, String instanceName, + String resourceName, List partitionNames, StateTransitionType stateTransitionType) { + HelixDataAccessor accessor = + new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + PropertyKey.Builder keyBuilder = accessor.keyBuilder(); + + // check the instance is alive + LiveInstance liveInstance = accessor.getProperty(keyBuilder.liveInstance(instanceName)); + if (liveInstance == null) { + // check if the instance exists in the cluster + String instanceConfigPath = PropertyPathBuilder.instanceConfig(clusterName, instanceName); + throw new HelixException(String.format( + (_zkClient.exists(instanceConfigPath) ? SetPartitionFailureReason.INSTANCE_NOT_ALIVE + : SetPartitionFailureReason.INSTANCE_NON_EXISTENT).getMessage(resourceName, + partitionNames, instanceName, instanceName, clusterName, stateTransitionType))); + } + + // check resource exists in ideal state + IdealState idealState = accessor.getProperty(keyBuilder.idealStates(resourceName)); + if (idealState == null) { + throw new HelixException( + String.format(SetPartitionFailureReason.RESOURCE_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, resourceName, clusterName, stateTransitionType))); + } + + // check partition exists in resource + Set partitionsNames = new HashSet(partitionNames); + Set partitions = (idealState.getRebalanceMode() == RebalanceMode.CUSTOMIZED) + ? idealState.getRecord().getMapFields().keySet() + : idealState.getRecord().getListFields().keySet(); + if (!partitions.containsAll(partitionsNames)) { + throw new HelixException( + String.format(SetPartitionFailureReason.PARTITION_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, partitionNames.toString(), clusterName, stateTransitionType))); + } + + // check partition is in ERROR state if reset is set to True + String sessionId = liveInstance.getEphemeralOwner(); + CurrentState curState = + accessor.getProperty(keyBuilder.currentState(instanceName, sessionId, resourceName)); + if (stateTransitionType.equals(StateTransitionType.RESET)) { + for (String partitionName : partitionNames) { + if (!curState.getState(partitionName).equals(HelixDefinedState.ERROR.toString())) { + throw new HelixException(String.format( + SetPartitionFailureReason.PARTITION_NOT_ERROR.getMessage(resourceName, partitionNames, + instanceName, partitionNames.toString(), clusterName, stateTransitionType))); + } + } + } + + // check stateModelDef exists + String stateModelDef = idealState.getStateModelDefRef(); + StateModelDefinition stateModel = accessor.getProperty(keyBuilder.stateModelDef(stateModelDef)); + if (stateModel == null) { + throw new HelixException( + String.format(SetPartitionFailureReason.STATE_MODEL_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, stateModelDef, clusterName, stateTransitionType))); + } + + // check there is no pending messages for the partitions exist + List messages = accessor.getChildValues(keyBuilder.messages(instanceName), true); + for (Message message : messages) { + if (!MessageType.STATE_TRANSITION.name().equalsIgnoreCase(message.getMsgType()) + || !sessionId.equals(message.getTgtSessionId()) + || !resourceName.equals(message.getResourceName()) + || 
!partitionsNames.contains(message.getPartitionName())) { + continue; + } + + throw new HelixException(String.format( + "Can't %s state for %s.%s on %s, because a pending message %s exists for resource %s", + stateTransitionType.name(), resourceName, partitionNames, instanceName, message, + message.getResourceName())); + } + + String adminName = null; + try { + adminName = InetAddress.getLocalHost().getCanonicalHostName() + "-ADMIN"; + } catch (UnknownHostException e) { + logger.info("Unable to get host name. Will set it to UNKNOWN, mostly ignorable", e); + adminName = "UNKNOWN"; + } + + List stateTransitionMessages = new ArrayList(); + List messageKeys = new ArrayList(); + for (String partitionName : partitionNames) { + String msgId = UUID.randomUUID().toString(); + Message message = new Message(MessageType.STATE_TRANSITION, msgId); + message.setSrcName(adminName); + message.setTgtName(instanceName); + message.setMsgState(MessageState.NEW); + message.setPartitionName(partitionName); + message.setResourceName(resourceName); + message.setTgtSessionId(sessionId); + message.setStateModelDef(stateModelDef); + message.setStateModelFactoryName(idealState.getStateModelFactoryName()); + // if reset == TRUE, send ERROR to initialState message + // else, send * to ERROR state message + if (stateTransitionType.equals(StateTransitionType.RESET)) { + message.setFromState(HelixDefinedState.ERROR.toString()); + message.setToState(stateModel.getInitialState()); + } + if (stateTransitionType.equals(StateTransitionType.SET_TO_ERROR)) { + message.setFromState("*"); + message.setToState(HelixDefinedState.ERROR.toString()); + } + if (idealState.getResourceGroupName() != null) { + message.setResourceGroupName(idealState.getResourceGroupName()); + } + if (idealState.getInstanceGroupTag() != null) { + message.setResourceTag(idealState.getInstanceGroupTag()); + } + + stateTransitionMessages.add(message); + messageKeys.add(keyBuilder.message(instanceName, message.getId())); + } + + accessor.setChildren(messageKeys, stateTransitionMessages); + } + private void enableClusterPauseMode(String clusterName, boolean cancelPendingST, String reason) { String hostname = NetworkUtil.getLocalhostName(); logger.info( @@ -1180,7 +1310,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, } } - private enum ResetPartitionFailureReason { + private enum SetPartitionFailureReason { INSTANCE_NOT_ALIVE("%s is not alive in cluster %s"), INSTANCE_NON_EXISTENT("%s does not exist in cluster %s"), RESOURCE_NON_EXISTENT("resource %s is not added to cluster %s"), @@ -1190,129 +1320,33 @@ private enum ResetPartitionFailureReason { private String message; - ResetPartitionFailureReason(String message) { + SetPartitionFailureReason(String message) { this.message = message; } public String getMessage(String resourceName, List partitionNames, String instanceName, - String errorStateEntity, String clusterName) { - return String.format("Can't reset state for %s.%s on %s, because " + message, resourceName, - partitionNames, instanceName, errorStateEntity, clusterName); + String errorStateEntity, String clusterName, StateTransitionType stateTransitionType) { + return String.format("Can't %s state for %s.%s on %s, because " + message, + stateTransitionType.name(), resourceName, partitionNames, instanceName, errorStateEntity, + clusterName); } } + private enum StateTransitionType { + // sets state from ERROR to INIT. + RESET, + // sets state from ANY to ERROR. 
+ SET_TO_ERROR, + // Unknown StateTransitionType + UNDEFINED + } @Override public void resetPartition(String clusterName, String instanceName, String resourceName, List partitionNames) { logger.info("Reset partitions {} for resource {} on instance {} in cluster {}.", partitionNames == null ? "NULL" : HelixUtil.serializeByComma(partitionNames), resourceName, instanceName, clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); - PropertyKey.Builder keyBuilder = accessor.keyBuilder(); - - // check the instance is alive - LiveInstance liveInstance = accessor.getProperty(keyBuilder.liveInstance(instanceName)); - if (liveInstance == null) { - // check if the instance exists in the cluster - String instanceConfigPath = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - throw new HelixException(String.format( - (_zkClient.exists(instanceConfigPath) ? ResetPartitionFailureReason.INSTANCE_NOT_ALIVE - : ResetPartitionFailureReason.INSTANCE_NON_EXISTENT) - .getMessage(resourceName, partitionNames, instanceName, instanceName, clusterName))); - } - - // check resource group exists - IdealState idealState = accessor.getProperty(keyBuilder.idealStates(resourceName)); - if (idealState == null) { - throw new HelixException(String.format(ResetPartitionFailureReason.RESOURCE_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, resourceName, clusterName))); - } - - // check partition exists in resource group - Set resetPartitionNames = new HashSet(partitionNames); - Set partitions = - (idealState.getRebalanceMode() == RebalanceMode.CUSTOMIZED) ? idealState.getRecord() - .getMapFields().keySet() : idealState.getRecord().getListFields().keySet(); - if (!partitions.containsAll(resetPartitionNames)) { - throw new HelixException(String.format(ResetPartitionFailureReason.PARTITION_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, partitionNames.toString(), - clusterName))); - } - - // check partition is in ERROR state - String sessionId = liveInstance.getEphemeralOwner(); - CurrentState curState = - accessor.getProperty(keyBuilder.currentState(instanceName, sessionId, resourceName)); - for (String partitionName : resetPartitionNames) { - if (!curState.getState(partitionName).equals(HelixDefinedState.ERROR.toString())) { - throw new HelixException(String.format(ResetPartitionFailureReason.PARTITION_NOT_ERROR - .getMessage(resourceName, partitionNames, instanceName, partitionNames.toString(), - clusterName))); - } - } - - // check stateModelDef exists and get initial state - String stateModelDef = idealState.getStateModelDefRef(); - StateModelDefinition stateModel = accessor.getProperty(keyBuilder.stateModelDef(stateModelDef)); - if (stateModel == null) { - throw new HelixException(String.format(ResetPartitionFailureReason.STATE_MODEL_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, stateModelDef, clusterName))); - } - - // check there is no pending messages for the partitions exist - List messages = accessor.getChildValues(keyBuilder.messages(instanceName), true); - for (Message message : messages) { - if (!MessageType.STATE_TRANSITION.name().equalsIgnoreCase(message.getMsgType()) || !sessionId - .equals(message.getTgtSessionId()) || !resourceName.equals(message.getResourceName()) - || !resetPartitionNames.contains(message.getPartitionName())) { - continue; - } - - throw new HelixException(String.format( - "Can't reset state for %s.%s on %s, because a pending message %s exists 
for resource %s", - resourceName, partitionNames, instanceName, message.toString(), - message.getResourceName())); - } - - String adminName = null; - try { - adminName = InetAddress.getLocalHost().getCanonicalHostName() + "-ADMIN"; - } catch (UnknownHostException e) { - // can ignore it - logger.info("Unable to get host name. Will set it to UNKNOWN, mostly ignorable", e); - adminName = "UNKNOWN"; - } - - List resetMessages = new ArrayList(); - List messageKeys = new ArrayList(); - for (String partitionName : resetPartitionNames) { - // send ERROR to initialState message - String msgId = UUID.randomUUID().toString(); - Message message = new Message(MessageType.STATE_TRANSITION, msgId); - message.setSrcName(adminName); - message.setTgtName(instanceName); - message.setMsgState(MessageState.NEW); - message.setPartitionName(partitionName); - message.setResourceName(resourceName); - message.setTgtSessionId(sessionId); - message.setStateModelDef(stateModelDef); - message.setFromState(HelixDefinedState.ERROR.toString()); - message.setToState(stateModel.getInitialState()); - message.setStateModelFactoryName(idealState.getStateModelFactoryName()); - - if (idealState.getResourceGroupName() != null) { - message.setResourceGroupName(idealState.getResourceGroupName()); - } - if (idealState.getInstanceGroupTag() != null) { - message.setResourceTag(idealState.getInstanceGroupTag()); - } - - resetMessages.add(message); - messageKeys.add(keyBuilder.message(instanceName, message.getId())); - } - - accessor.setChildren(messageKeys, resetMessages); + sendStateTransitionMessage(clusterName, instanceName, resourceName, partitionNames, StateTransitionType.RESET); } @Override diff --git a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java index 0d67ced4b3..0a91370b07 100644 --- a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java +++ b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java @@ -176,7 +176,8 @@ void postHandleMessage() { deltaList.add(delta); _currentStateDelta.setDeltaList(deltaList); _stateModelFactory.removeStateModel(_message.getResourceName(), partitionKey); - } else if (_stateModel.getCurrentState().equals(_message.getFromState())) { + } else if (_message.getFromState().equals("*") + || _stateModel.getCurrentState().equals(_message.getFromState())) { // if the partition is not to be dropped, update _stateModel to the TO_STATE // need this check because TaskRunner may change _stateModel before reach here. 
_stateModel.updateState(toState); diff --git a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java index 4470b99964..6a9473ebaa 100644 --- a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java +++ b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java @@ -323,6 +323,7 @@ private void reportMessageStat(HelixManager manager, Message message, HelixTaskR String fromState = message.getFromState(); String toState = message.getToState(); String transition = fromState + "--" + toState; + transition = transition.replaceAll("\\*", "ANY"); StateTransitionContext cxt = new StateTransitionContext(manager.getClusterName(), manager.getInstanceName(), diff --git a/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java b/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java index 143c14adef..5bb2a19c86 100644 --- a/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java +++ b/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java @@ -115,4 +115,15 @@ public void cancel() { public boolean isCancelled() { return _cancelled; } + + /* + * default transition to set partition in any state to error state + * @param message + * @param context + * @throws InterruptedException + */ + @Transition(to = "ERROR", from = "*") + public void onBecomeErrorFromAny(Message message, NotificationContext context) throws Exception { + logger.info("Default *->ERROR transition invoked."); + } } diff --git a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java index 633ce0341f..a9578632b0 100644 --- a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java +++ b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java @@ -134,6 +134,9 @@ public class ClusterSetup { public static final String resetInstance = "resetInstance"; public static final String resetResource = "resetResource"; + // set partitions to ERROR + public static final String setPartitionsToError = "setPartitionsToError"; + // help public static final String help = "help"; @@ -1114,6 +1117,13 @@ private static Options constructCommandLineOptions() { removeCloudConfigOption.setRequired(false); removeCloudConfigOption.setArgName("clusterName"); + Option setPartitionsToErrorOption = + OptionBuilder.withLongOpt(setPartitionsToError) + .withDescription("Set a Partition to Error State").create(); + setPartitionsToErrorOption.setArgs(4); + setPartitionsToErrorOption.setRequired(false); + setPartitionsToErrorOption.setArgName("clusterName instanceName resourceName partitionName"); + OptionGroup group = new OptionGroup(); group.setRequired(true); group.addOption(rebalanceOption); @@ -1153,6 +1163,7 @@ private static Options constructCommandLineOptions() { group.addOption(listStateModelOption); group.addOption(addResourcePropertyOption); group.addOption(removeResourcePropertyOption); + group.addOption(setPartitionsToErrorOption); // set/get/remove config options group.addOption(setConfOption); @@ -1561,6 +1572,16 @@ public static int processCommandLineArgs(String[] cliArgs) throws Exception { String newInstanceName = cmd.getOptionValues(swapInstance)[2]; setupTool.swapInstance(clusterName, oldInstanceName, newInstanceName); + } else if (cmd.hasOption(setPartitionsToError)) { + String[] args = 
cmd.getOptionValues(setPartitionsToError);
+
+ String clusterName = args[0];
+ String instanceName = args[1];
+ String resourceName = args[2];
+ List<String> partitionNames = Arrays.asList(Arrays.copyOfRange(args, 3, args.length));
+
+ setupTool.getClusterManagementTool().setPartitionsToError(clusterName, instanceName, resourceName, partitionNames);
+ return 0;
 }
 // set/get/remove config options
 else if (cmd.hasOption(setConfig)) {
diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java b/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java
new file mode 100644
index 0000000000..5b13703b6f
--- /dev/null
+++ b/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java
@@ -0,0 +1,99 @@
+package org.apache.helix.integration;
+
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.helix.TestHelper;
+import org.apache.helix.common.ZkTestBase;
+import org.apache.helix.integration.manager.ClusterControllerManager;
+import org.apache.helix.integration.manager.MockParticipantManager;
+import org.apache.helix.tools.ClusterSetup;
+import org.apache.helix.tools.ClusterStateVerifier;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class TestSetPartitionsToErrorState extends ZkTestBase {
+
+ @Test()
+ public void testSetPartitionsToErrorState() throws Exception {
+ String className = TestHelper.getTestClassName();
+ String methodName = TestHelper.getTestMethodName();
+ String clusterName = className + "_" + methodName;
+ final int n = 5;
+
+ System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis()));
+
+ TestHelper.setupCluster(clusterName, ZK_ADDR, 12918, // participant port
+ "localhost", // participant name prefix
+ "TestDB", // resource name prefix
+ 1, // resources
+ 10, // partitions per resource
+ n, // number of nodes
+ 3, // replicas
+ "MasterSlave", true); // do rebalance
+
+ ClusterControllerManager controller =
+ new ClusterControllerManager(ZK_ADDR, clusterName, "controller_0");
+ controller.syncStart();
+
+ // start mock participants
+ MockParticipantManager[] participants = new MockParticipantManager[n];
+ for (int i = 0; i < n; i++) {
+ String instanceName = "localhost_" + (12918 + i);
+ participants[i] = new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
+ participants[i].syncStart();
+ }
+
+ // verify cluster
+ HashMap<String, Map<String, String>> errStateMap = new HashMap<>();
+ errStateMap.put("TestDB0", new HashMap<>());
+ boolean result = ClusterStateVerifier.verifyByZkCallback(
+ (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap)));
+ Assert.assertTrue(result, "Cluster verification fails");
+
+ // set a non exist partition to ERROR, should throw exception
+ try {
+ String command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName
+ + " localhost_12918 TestDB0 TestDB0_nonExist";
+ ClusterSetup.processCommandLineArgs(command.split("\\s+"));
+ Assert.fail("Should throw exception on setting a non-exist partition to error");
+ } catch (Exception e) {
+ // OK
+ }
+
+ // set one partition not in ERROR state to ERROR
+ String command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName
+ + " localhost_12918 TestDB0 TestDB0_4";
+ ClusterSetup.processCommandLineArgs(command.split("\\s+"));
+ errStateMap.get("TestDB0").put("TestDB0_4", "localhost_12918");
+ result = ClusterStateVerifier.verifyByZkCallback(
+ (new
ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // set another partition not in ERROR state to ERROR + command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_7"; + ClusterSetup.processCommandLineArgs(command.split("\\s+")); + errStateMap.get("TestDB0").put("TestDB0_7", "localhost_12918"); + result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // setting a partition already in ERROR state to ERROR - message does not get processed + command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_7"; + ClusterSetup.processCommandLineArgs(command.split("\\s+")); + result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // clean up + controller.syncStop(); + for (int i = 0; i < 5; i++) { + participants[i].syncStop(); + } + deleteCluster(clusterName); + + System.out.println("END " + clusterName + " at " + new Date(System.currentTimeMillis())); + } +} diff --git a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java index 59decd98e5..5581108578 100644 --- a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java @@ -589,6 +589,117 @@ public void testLegacyEnableDisablePartition() { 2); } + @Test(description = "Unit test for sanity check in setPartitionsToError()") + public void testSetPartitionsToError() throws Exception { + String className = TestHelper.getTestClassName(); + String methodName = TestHelper.getTestMethodName(); + String clusterName = className + "_" + methodName; + String instanceName = "TestInstance"; + String testResource = "TestResource"; + String wrongTestInstance = "WrongTestInstance"; + String wrongTestResource = "WrongTestResource"; + System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis())); + HelixAdmin admin = new ZKHelixAdmin(_gZkClient); + admin.addCluster(clusterName, true); + admin.addInstance(clusterName, new InstanceConfig(instanceName)); + admin.enableInstance(clusterName, instanceName, true); + InstanceConfig instanceConfig = admin.getInstanceConfig(clusterName, instanceName); + + IdealState idealState = new IdealState(testResource); + idealState.setNumPartitions(3); + admin.addStateModelDef(clusterName, "MasterSlave", new MasterSlaveSMD()); + idealState.setStateModelDefRef("MasterSlave"); + idealState.setRebalanceMode(IdealState.RebalanceMode.FULL_AUTO); + admin.addResource(clusterName, testResource, idealState); + admin.enableResource(clusterName, testResource, true); + + /* + * This is a unit test for sanity check in setPartitionsToError(). + * There is no running controller in this test. We have end-to-end tests for + * setPartitionsToError() + * under integration/TestSetPartitionsToError. + */ + // setPartitionsToError is expected to throw an exception when provided with a nonexistent + // instance. 
+ try { + admin.setPartitionsToError(clusterName, wrongTestInstance, testResource, + Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the instance name is made up. + Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", + testResource, wrongTestInstance, clusterName)); + } + + // setPartitionsToError is expected to throw an exception when provided with a non-live + // instance. + try { + admin.setPartitionsToError(clusterName, instanceName, testResource, Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the instance is not alive. + Assert.assertEquals(expected.getMessage(), + String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because %s is not alive in cluster %s", + testResource, instanceName, instanceName, clusterName)); + } + + HelixManager manager = initializeHelixManager(clusterName, instanceConfig.getInstanceName()); + manager.connect(); + + // setPartitionsToError is expected to throw an exception when provided with a nonexistent + // resource. + try { + admin.setPartitionsToError(clusterName, instanceName, wrongTestResource, + Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the resource is not added. + Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", + wrongTestResource, instanceName, wrongTestResource, clusterName)); + } + + // setPartitionsToError is expected to throw an exception when partition does not exist. + try { + admin.setPartitionsToError(clusterName, instanceName, testResource, Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because partitions do not exist. + Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", + testResource, instanceName, clusterName)); + } + + // clean up + manager.disconnect(); + admin.dropCluster(clusterName); + + // verify the cluster has been removed successfully + HelixDataAccessor dataAccessor = + new ZKHelixDataAccessor(className, new ZkBaseDataAccessor<>(_gZkClient)); + try { + Assert.assertTrue(TestHelper.verify( + () -> dataAccessor.getChildNames(dataAccessor.keyBuilder().liveInstances()).isEmpty(), + 1000)); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("There're live instances not cleaned up yet"); + assert false; + } + + try { + Assert.assertTrue(TestHelper.verify( + () -> dataAccessor.getChildNames(dataAccessor.keyBuilder().clusterConfig()).isEmpty(), + 1000)); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("The cluster is not cleaned up yet"); + assert false; + } + } + @Test public void testResetPartition() throws Exception { String className = TestHelper.getTestClassName(); @@ -625,7 +736,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the instance name is made up. 
Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", + "Can't RESET state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", testResource, wrongTestInstance, clusterName)); } @@ -636,7 +747,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the instance is not alive. Assert.assertEquals(expected.getMessage(), String - .format("Can't reset state for %s.[1, 2] on %s, because %s is not alive in cluster %s", + .format("Can't RESET state for %s.[1, 2] on %s, because %s is not alive in cluster %s", testResource, instanceName, instanceName, clusterName)); } @@ -650,7 +761,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the resource is not added. Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", + "Can't RESET state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", wrongTestResource, instanceName, wrongTestResource, clusterName)); } @@ -660,7 +771,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because partitions do not exist. Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", + "Can't RESET state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", testResource, instanceName, clusterName)); } diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index 9a1311b1c3..d9bc5d7fe6 100644 --- a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -364,6 +364,12 @@ public ClusterManagementMode getClusterManagementMode(String clusterName) { return null; } + @Override + public void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List partitionNames) { + + } + @Override public void resetPartition(String clusterName, String instanceName, String resourceName, List partitionNames) { diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java index ce3d27273e..fdad634afd 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java @@ -89,7 +89,8 @@ public enum Command { canCompleteSwap, completeSwapIfPossible, onDemandRebalance, - isEvacuateFinished + isEvacuateFinished, + setPartitionsToError } @Context diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java index efeeee7f7e..ea98f66371 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java @@ -434,6 +434,16 @@ public Response updateInstance(@PathParam("clusterId") String clusterId, 
OBJECT_MAPPER.getTypeFactory() .constructCollectionType(List.class, String.class))); break; + case setPartitionsToError: + if (!validInstance(node, instanceName)) { + return badRequest("Instance names are not a match!"); + } + admin.setPartitionsToError(clusterId, instanceName, + node.get(PerInstanceProperties.resource.name()).textValue(), + (List) OBJECT_MAPPER.readValue( + node.get(PerInstanceProperties.partitions.name()).toString(), OBJECT_MAPPER + .getTypeFactory().constructCollectionType(List.class, String.class))); + break; case setInstanceOperation: admin.setInstanceOperation(clusterId, instanceName, state); break; diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java index 943444cad1..395f9bf858 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java @@ -37,11 +37,13 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import org.apache.helix.HelixDataAccessor; +import org.apache.helix.HelixDefinedState; import org.apache.helix.HelixException; import org.apache.helix.TestHelper; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixDataAccessor; import org.apache.helix.model.ClusterConfig; +import org.apache.helix.model.ExternalView; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.Message; @@ -377,7 +379,7 @@ public void testDeleteInstance() { } @Test(dependsOnMethods = "testDeleteInstance") - public void updateInstance() throws IOException { + public void updateInstance() throws Exception { System.out.println("Start test :" + TestHelper.getTestMethodName()); // Disable instance Entity entity = Entity.entity("", MediaType.APPLICATION_JSON_TYPE); @@ -461,11 +463,11 @@ public void updateInstance() throws IOException { String dbName = "_db_0_"; List partitionsToDisable = Arrays.asList(CLUSTER_NAME + dbName + "0", CLUSTER_NAME + dbName + "1", CLUSTER_NAME + dbName + "3"); + String RESOURCE_NAME = CLUSTER_NAME + dbName.substring(0, dbName.length() - 1); entity = Entity.entity( OBJECT_MAPPER.writeValueAsString(ImmutableMap.of(AbstractResource.Properties.id.name(), - INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(), - CLUSTER_NAME + dbName.substring(0, dbName.length() - 1), + INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME, PerInstanceAccessor.PerInstanceProperties.partitions.name(), partitionsToDisable)), MediaType.APPLICATION_JSON_TYPE); @@ -474,13 +476,11 @@ public void updateInstance() throws IOException { InstanceConfig instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); Assert.assertEquals( - new HashSet<>(instanceConfig.getDisabledPartitionsMap() - .get(CLUSTER_NAME + dbName.substring(0, dbName.length() - 1))), + new HashSet<>(instanceConfig.getDisabledPartitionsMap().get(RESOURCE_NAME)), new HashSet<>(partitionsToDisable)); entity = Entity.entity(OBJECT_MAPPER.writeValueAsString(ImmutableMap .of(AbstractResource.Properties.id.name(), INSTANCE_NAME, - PerInstanceAccessor.PerInstanceProperties.resource.name(), - CLUSTER_NAME + dbName.substring(0, dbName.length() - 1), + PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME, 
PerInstanceAccessor.PerInstanceProperties.partitions.name(), ImmutableList.of(CLUSTER_NAME + dbName + "1"))), MediaType.APPLICATION_JSON_TYPE); @@ -488,8 +488,7 @@ public void updateInstance() throws IOException { .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals(new HashSet<>(instanceConfig.getDisabledPartitionsMap() - .get(CLUSTER_NAME + dbName.substring(0, dbName.length() - 1))), + Assert.assertEquals(new HashSet<>(instanceConfig.getDisabledPartitionsMap().get(RESOURCE_NAME)), new HashSet<>(Arrays.asList(CLUSTER_NAME + dbName + "0", CLUSTER_NAME + dbName + "3"))); // test set instance operation @@ -595,6 +594,32 @@ public void updateInstance() throws IOException { evacuateFinishedResult = OBJECT_MAPPER.readValue(response.readEntity(String.class), Map.class); Assert.assertEquals(response.getStatus(), Response.Status.OK.getStatusCode()); Assert.assertTrue(evacuateFinishedResult.get("successful")); + + // test setPartitionsToError + List partitionsToSetToError = Arrays.asList(CLUSTER_NAME + dbName + "7"); + + entity = Entity.entity( + OBJECT_MAPPER.writeValueAsString(ImmutableMap.of(AbstractResource.Properties.id.name(), + INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME, + PerInstanceAccessor.PerInstanceProperties.partitions.name(), partitionsToSetToError)), + MediaType.APPLICATION_JSON_TYPE); + + response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setPartitionsToError") + .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); + + Assert.assertEquals(response.getStatus(), Response.Status.OK.getStatusCode()); + + TestHelper.verify(() -> { + ExternalView externalView = _gSetupTool.getClusterManagementTool() + .getResourceExternalView(CLUSTER_NAME, RESOURCE_NAME); + Set responseForAllPartitions = new HashSet(); + for (String partition : partitionsToSetToError) { + responseForAllPartitions.add(externalView.getStateMap(partition) + .get(INSTANCE_NAME) == HelixDefinedState.ERROR.toString()); + } + return !responseForAllPartitions.contains(Boolean.FALSE); + }, TestHelper.WAIT_DURATION); + System.out.println("End test :" + TestHelper.getTestMethodName()); } From ca6ad6bc35ad9a83607392e6b141722d854be7e5 Mon Sep 17 00:00:00 2001 From: Junkai Xue Date: Wed, 8 May 2024 15:42:32 -0700 Subject: [PATCH 05/11] Disable the unstable test for task framework --- .../integration/task/TestTaskSchedulingTwoCurrentStates.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java index ada5157a3b..4181c4b822 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java +++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java @@ -96,7 +96,7 @@ public void afterClass() throws Exception { super.afterClass(); } - @Test + @Test (enabled = false) public void testTargetedTaskTwoCurrentStates() throws Exception { _gSetupTool.addResourceToCluster(CLUSTER_NAME, DATABASE, _numPartitions, MASTER_SLAVE_STATE_MODEL, IdealState.RebalanceMode.SEMI_AUTO.name()); From 93c59a763c1f4ec72aeae8c654292b15d0f50ed5 Mon Sep 17 00:00:00 2001 From: Himanshu Kandwal Date: Wed, 8 May 2024 18:25:45 -0700 Subject: [PATCH 06/11] [apache/helix] -- Provide JDK 1.8 (backward) 
compatibility for meta-client (#2799) We would like to provide a backward compatible support to our consumers where they also have an option to use JDK-8 compiled helix-core and meta-client jar, if they have such a requirement. By default we will generate JDK-11 jars and JDK-8 jars using a classifier. --- helix-core/pom.xml | 1 - meta-client/pom.xml | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/helix-core/pom.xml b/helix-core/pom.xml index ba3d7b7e51..c8c5a504d2 100644 --- a/helix-core/pom.xml +++ b/helix-core/pom.xml @@ -219,7 +219,6 @@ package jar - test-jar ${project.build.outputDirectory}_jdk8 diff --git a/meta-client/pom.xml b/meta-client/pom.xml index 29092ef1e8..e08872c8c2 100644 --- a/meta-client/pom.xml +++ b/meta-client/pom.xml @@ -89,6 +89,24 @@ under the License. + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin From c11cddf916fa7809ea99a10d97aa18dc8bc90a87 Mon Sep 17 00:00:00 2001 From: Himanshu Kandwal Date: Thu, 9 May 2024 13:44:49 -0700 Subject: [PATCH 07/11] [apache/helix] -- [Part-2] Enable JDK 1.8 (backward) compatibility for meta-client (#2802) We would like to provide a backward compatible support to our consumers where they also have an option to use JDK-8 compiled meta-client jar, if they have such a requirement. By default we will generate JDK-11 jars and JDK-8 jars using a classifier. --- meta-client/pom.xml | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/meta-client/pom.xml b/meta-client/pom.xml index e08872c8c2..a4762eb371 100644 --- a/meta-client/pom.xml +++ b/meta-client/pom.xml @@ -89,6 +89,36 @@ under the License. + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + org.apache.maven.plugins maven-jar-plugin From e0e30a7bea7a7d838b9311ff34c7aed5eaa79fff Mon Sep 17 00:00:00 2001 From: Charanya Sudharsanan Date: Tue, 4 Jun 2024 15:37:16 -0700 Subject: [PATCH 08/11] [apache/helix] --> Update logic for metric calculation when replica is set to ANY_LIVEINSTANCE (#2804) This PR updates logic for metric calculation when replica is set to ANY_LIVEINSTANCE to avoid NFE. 
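For quick review, a condensed before/after view of the ResourceMonitor.updateResourceState() hunk that follows; the identifiers come from the diff itself, and this is an illustrative sketch rather than an excerpt to apply:

    // Before: the raw REPLICAS string was parsed with Integer.valueOf(), so the literal
    // "ANY_LIVEINSTANCE" raised a NumberFormatException and the MBean update was skipped.
    // After: IdealState#getReplicaCount(int) returns the sentinel -1 for ANY_LIVEINSTANCE,
    // so the monitor keeps updating its gauges instead of returning early.
    int replica = idealState.getReplicaCount(-1);
    int minActiveReplica = idealState.getMinActiveReplicas();
    minActiveReplica = (minActiveReplica >= 0) ? minActiveReplica : replica;

The new TestResourceMonitor case below exercises exactly this path: with REPLICAS set to ANY_LIVEINSTANCE and every MASTER replica flipped to OFFLINE, the missing-top-state gauge still reports all partitions while the replica-count based gauges report zero.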
--- .../monitoring/mbeans/ResourceMonitor.java | 14 +++---------- .../mbeans/TestResourceMonitor.java | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 11 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java index 5064c64812..012513ffcd 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java @@ -272,17 +272,9 @@ public void updateResourceState(ExternalView externalView, IdealState idealState long numOfPartitionWithTopState = 0; Set partitions = idealState.getPartitionSet(); - int replica; - try { - replica = Integer.valueOf(idealState.getReplicas()); - } catch (NumberFormatException e) { - _logger.info("Unspecified replica count for {}, skip updating the ResourceMonitor Mbean: {}", _resourceName, - idealState.getReplicas()); - return; - } catch (Exception ex) { - _logger.warn("Failed to get replica count for {}, cannot update the ResourceMonitor Mbean.", _resourceName); - return; - } + + // returns -1 when replica is set to ANY_LIVEINSTANCE. + int replica = idealState.getReplicaCount(-1); int minActiveReplica = idealState.getMinActiveReplicas(); minActiveReplica = (minActiveReplica >= 0) ? minActiveReplica : replica; diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java index 0fb0e09371..355cad4501 100644 --- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java +++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java @@ -37,6 +37,7 @@ import com.google.common.collect.ImmutableMap; import org.apache.helix.TestHelper; +import org.apache.helix.model.ResourceConfig; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.model.BuiltInStateModelDefinitions; import org.apache.helix.model.ExternalView; @@ -220,6 +221,26 @@ public void testReportData() throws JMException { monitor.setRebalanceState(ResourceMonitor.RebalanceStatus.INTERMEDIATE_STATE_CAL_FAILED); Assert.assertEquals(monitor.getRebalanceState(), ResourceMonitor.RebalanceStatus.INTERMEDIATE_STATE_CAL_FAILED.name()); + + // test when replica is set to ANY_LIVEINSTANCE and all instances are taken offline. + idealState.setReplicas(ResourceConfig.ResourceConfigConstants.ANY_LIVEINSTANCE.name()); + + for (int i = 0; i < _partitions; i++) { + String partition = _dbName + "_" + i; + Map externalViewStateMap = externalView.getStateMap(partition); + for (String key : externalViewStateMap.keySet()) { + if (externalViewStateMap.get(key).equalsIgnoreCase("MASTER")) { + externalViewStateMap.put(key, "OFFLINE"); + } + } + externalView.setStateMap(partition, externalViewStateMap); + } + + monitor.updateResourceState(externalView, idealState, stateModelDef); + + Assert.assertEquals(monitor.getMissingTopStatePartitionGauge(), _partitions); + Assert.assertEquals(monitor.getMissingReplicaPartitionGauge(), 0); + Assert.assertEquals(monitor.getMissingMinActiveReplicaPartitionGauge(), 0); } finally { // Has to unregister this monitor to clean up. Otherwise, later tests may be affected and fail. 
monitor.unregister(); From 17f2df6f6e2ee453668e4cd8a562a93434d531d8 Mon Sep 17 00:00:00 2001 From: Himanshu Kandwal Date: Wed, 5 Jun 2024 09:50:17 -0700 Subject: [PATCH 09/11] [apache/helix] -- Enable JDK 1.8 (backward) compatibility for dependent modules of meta-client and helix-core (#2806) We would like to provide a backward compatible support to our consumers where they also have an option to use JDK-8 compiled meta-client jar, if they have such a requirement. By default we will generate JDK-11 jars and JDK-8 jars using a classifier. In this PR, we are enabling the JDK* build of full sub-tree of the helix-core and meta-client modules. This includes: helix-common, zookeeper-api, metadata-store-directory-common, metrics-common --- helix-common/pom.xml | 48 +++++++++++++++++++ .../helix/constants/InstanceConstants.java | 8 ++-- metadata-store-directory-common/pom.xml | 48 +++++++++++++++++++ metrics-common/pom.xml | 48 +++++++++++++++++++ zookeeper-api/pom.xml | 48 +++++++++++++++++++ 5 files changed, 197 insertions(+), 3 deletions(-) diff --git a/helix-common/pom.xml b/helix-common/pom.xml index 40f64043b6..82e35c86dd 100644 --- a/helix-common/pom.xml +++ b/helix-common/pom.xml @@ -89,6 +89,54 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin diff --git a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java index 07eb4989d3..85c22c460b 100644 --- a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java +++ b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java @@ -2,6 +2,8 @@ import java.util.Set; +import com.google.common.collect.ImmutableSet; + public class InstanceConstants { public static final String INSTANCE_NOT_DISABLED = "INSTANCE_NOT_DISABLED"; @@ -9,7 +11,7 @@ public class InstanceConstants { * The set contains the InstanceOperations that are allowed to be assigned replicas by the rebalancer. */ public static final Set ASSIGNABLE_INSTANCE_OPERATIONS = - Set.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE); + ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE); /** @@ -18,14 +20,14 @@ public class InstanceConstants { * TODO: Remove this when the deprecated HELIX_ENABLED is removed. */ public static final Set INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS = - Set.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE, InstanceOperation.EVACUATE); + ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE, InstanceOperation.EVACUATE); /** * The set of InstanceOperations that are not allowed to be populated in the RoutingTableProvider. 
*/ public static final Set UNSERVABLE_INSTANCE_OPERATIONS = - Set.of(InstanceOperation.SWAP_IN, InstanceOperation.UNKNOWN); + ImmutableSet.of(InstanceOperation.SWAP_IN, InstanceOperation.UNKNOWN); public enum InstanceDisabledType { CLOUD_EVENT, diff --git a/metadata-store-directory-common/pom.xml b/metadata-store-directory-common/pom.xml index f173397cc4..98e2e5cb03 100644 --- a/metadata-store-directory-common/pom.xml +++ b/metadata-store-directory-common/pom.xml @@ -113,6 +113,54 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin diff --git a/metrics-common/pom.xml b/metrics-common/pom.xml index 2dbe016cb2..433c575aac 100644 --- a/metrics-common/pom.xml +++ b/metrics-common/pom.xml @@ -84,6 +84,54 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin diff --git a/zookeeper-api/pom.xml b/zookeeper-api/pom.xml index d44160fdb7..bfb993feec 100644 --- a/zookeeper-api/pom.xml +++ b/zookeeper-api/pom.xml @@ -133,6 +133,54 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin From b7d771e7032dd542ab8b466f28d239c3e425f5de Mon Sep 17 00:00:00 2001 From: Zachary Pinto Date: Tue, 11 Jun 2024 11:17:07 -0700 Subject: [PATCH 10/11] Deprecate HELIX_DISABLED_REASON and refactor how InstanceOperation is represented in instance configs. (#2801) Deprecate HELIX_DISABLED_REASON and HELIX_DISABLED_TYPE; Refactor INSTANCE_OPERATION to HELIX_INSTANCE_OPERATIONS List Field To prevent conflicts from different clients setting the InstanceOperation, we are introducing the HELIX_INSTANCE_OPERATIONS list. Key changes: - Clients using the old Helix enabled APIs will take precedence over INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS when those fields are set. - When the new InstanceOperation APIs set the operation type to DISABLE, the old HELIX_ENABLED field will also be set for backwards compatibility. - For all InstanceOperation API invocations, the source will default to USER unless specified otherwise. An AUTOMATION source will create a separate entry in the list. - The most recent non-ENABLE InstanceOperation entry will be the active InstanceOperation used by the controller and returned by the getInstanceOperation API. These changes ensure smoother operation transitions and maintain compatibility with existing APIs. 
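For reviewers, a minimal usage sketch of the new API surface introduced below (the HelixAdmin#setInstanceOperation overloads, the InstanceConfig.InstanceOperation builder, and InstanceConstants.InstanceOperationSource). The cluster name, instance name, and ZooKeeper address are placeholders; the snippet is illustrative only and is not part of the change:

    import org.apache.helix.HelixAdmin;
    import org.apache.helix.constants.InstanceConstants;
    import org.apache.helix.manager.zk.ZKHelixAdmin;
    import org.apache.helix.model.InstanceConfig;

    public class InstanceOperationUsageSketch {
      public static void main(String[] args) {
        HelixAdmin admin = new ZKHelixAdmin("localhost:2181"); // placeholder ZK address

        // USER-sourced EVACUATE with a reason; overrideAll=false preserves entries
        // recorded by other sources in the HELIX_INSTANCE_OPERATIONS list.
        admin.setInstanceOperation("myCluster", "host_12918",
            InstanceConstants.InstanceOperation.EVACUATE, "host decommission", false);

        // Building an operation entry directly, as the cloud event callback now does
        // before handing it to InstanceUtil.setInstanceOperation().
        InstanceConfig.InstanceOperation cloudDisable =
            new InstanceConfig.InstanceOperation.Builder()
                .setOperation(InstanceConstants.InstanceOperation.DISABLE)
                .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION)
                .setReason("cloud maintenance event")
                .build();

        // The active operation reported by getInstanceOperation() is the most recent
        // non-ENABLE entry in the list.
        InstanceConstants.InstanceOperation active =
            admin.getInstanceConfig("myCluster", "host_12918")
                .getInstanceOperation().getOperation();
        System.out.println("active=" + active + ", pending cloud entry=" + cloudDisable);
      }
    }

Because a DISABLE set through the new API also writes the legacy HELIX_ENABLED field, clients that still read the old flag continue to see the instance as disabled during the migration.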
--- .../helix/constants/InstanceConstants.java | 37 +- .../java/org/apache/helix/HelixAdmin.java | 31 +- .../helix/DefaultCloudEventCallbackImpl.java | 24 +- .../event/helix/HelixEventHandlingUtil.java | 3 + .../trimmer/InstanceConfigTrimmer.java | 16 + .../BaseControllerDataProvider.java | 8 +- .../stages/BestPossibleStateCalcStage.java | 8 +- .../stages/CurrentStateComputationStage.java | 1 + .../apache/helix/manager/zk/ZKHelixAdmin.java | 314 ++++++---------- .../apache/helix/model/InstanceConfig.java | 355 +++++++++++++++--- .../helix/spectator/RoutingDataCache.java | 9 +- .../org/apache/helix/util/InstanceUtil.java | 198 ++++++++++ .../TestDefaultCloudEventCallbackImpl.java | 10 +- .../rebalancer/TestInstanceOperation.java | 37 +- .../helix/manager/zk/TestZkHelixAdmin.java | 62 ++- .../org/apache/helix/mock/MockHelixAdmin.java | 29 +- .../helix/model/TestInstanceConfig.java | 114 +++++- .../StoppableInstancesSelector.java | 4 +- .../resources/helix/PerInstanceAccessor.java | 18 +- .../rest/server/TestPerInstanceAccessor.java | 8 +- 20 files changed, 958 insertions(+), 328 deletions(-) create mode 100644 helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java diff --git a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java index 85c22c460b..22f6c7c76f 100644 --- a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java +++ b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java @@ -20,21 +20,54 @@ public class InstanceConstants { * TODO: Remove this when the deprecated HELIX_ENABLED is removed. */ public static final Set INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS = - ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE, InstanceOperation.EVACUATE); + ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.EVACUATE); /** * The set of InstanceOperations that are not allowed to be populated in the RoutingTableProvider. 
*/ - public static final Set UNSERVABLE_INSTANCE_OPERATIONS = + public static final Set UNROUTABLE_INSTANCE_OPERATIONS = ImmutableSet.of(InstanceOperation.SWAP_IN, InstanceOperation.UNKNOWN); + @Deprecated public enum InstanceDisabledType { CLOUD_EVENT, USER_OPERATION, DEFAULT_INSTANCE_DISABLE_TYPE } + public enum InstanceOperationSource { + ADMIN(0), USER(1), AUTOMATION(2), DEFAULT(3); + + private final int _priority; + + InstanceOperationSource(int priority) { + _priority = priority; + } + + public int getPriority() { + return _priority; + } + + /** + * Convert from InstanceDisabledType to InstanceOperationTrigger + * + * @param disabledType InstanceDisabledType + * @return InstanceOperationTrigger + */ + public static InstanceOperationSource instanceDisabledTypeToInstanceOperationSource( + InstanceDisabledType disabledType) { + switch (disabledType) { + case CLOUD_EVENT: + return InstanceOperationSource.AUTOMATION; + case USER_OPERATION: + return InstanceOperationSource.USER; + default: + return InstanceOperationSource.DEFAULT; + } + } + } + public enum InstanceOperation { /** * Behavior: Replicas will be assigned to the node and will receive upward state transitions if diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java index 84a7154b18..07afb55b6f 100644 --- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java @@ -310,15 +310,38 @@ void enableInstance(String clusterName, String instanceName, boolean enabled, void enableInstance(String clusterName, List instances, boolean enabled); /** - * Set the instanceOperation field. Setting it to null is equivalent to - * ENABLE. + * Set the instanceOperation of and instance with {@link InstanceConstants.InstanceOperation}. * * @param clusterName The cluster name * @param instanceName The instance name - * @param instanceOperation The instance operation + * @param instanceOperation The instance operation type */ void setInstanceOperation(String clusterName, String instanceName, - @Nullable InstanceConstants.InstanceOperation instanceOperation); + InstanceConstants.InstanceOperation instanceOperation); + + /** + * Set the instanceOperation of and instance with {@link InstanceConstants.InstanceOperation}. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + */ + void setInstanceOperation(String clusterName, String instanceName, + InstanceConstants.InstanceOperation instanceOperation, String reason); + + /** + * Set the instanceOperation of and instance with {@link InstanceConstants.InstanceOperation}. 
+ * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + * @param overrideAll Whether to override all existing instance operations from all other + * instance operations + */ + void setInstanceOperation(String clusterName, String instanceName, + InstanceConstants.InstanceOperation instanceOperation, String reason, boolean overrideAll); /** * Disable or enable a resource diff --git a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java index 04ad4b798a..20c5001164 100644 --- a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java +++ b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java @@ -23,6 +23,8 @@ import org.apache.helix.HelixManager; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; +import org.apache.helix.model.InstanceConfig; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.util.InstanceValidationUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,9 +51,14 @@ public void disableInstance(HelixManager manager, Object eventInfo) { LOG.info("DefaultCloudEventCallbackImpl disable Instance {}", manager.getInstanceName()); if (InstanceValidationUtil .isEnabled(manager.getHelixDataAccessor(), manager.getInstanceName())) { - manager.getClusterManagmentTool() - .enableInstance(manager.getClusterName(), manager.getInstanceName(), false, - InstanceConstants.InstanceDisabledType.CLOUD_EVENT, message); + InstanceUtil.setInstanceOperation(manager.getConfigAccessor(), + manager.getHelixDataAccessor().getBaseDataAccessor(), manager.getClusterName(), + manager.getInstanceName(), + new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION) + .setReason(message) + .build()); } HelixEventHandlingUtil.updateCloudEventOperationInClusterConfig(manager.getClusterName(), manager.getInstanceName(), manager.getHelixDataAccessor().getBaseDataAccessor(), false, @@ -72,10 +79,13 @@ public void enableInstance(HelixManager manager, Object eventInfo) { HelixEventHandlingUtil .updateCloudEventOperationInClusterConfig(manager.getClusterName(), instanceName, manager.getHelixDataAccessor().getBaseDataAccessor(), true, message); - if (HelixEventHandlingUtil.isInstanceDisabledForCloudEvent(instanceName, accessor)) { - manager.getClusterManagmentTool().enableInstance(manager.getClusterName(), instanceName, true, - InstanceConstants.InstanceDisabledType.CLOUD_EVENT, message); - } + InstanceUtil.setInstanceOperation(manager.getConfigAccessor(), + manager.getHelixDataAccessor().getBaseDataAccessor(), manager.getClusterName(), + manager.getInstanceName(), + new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).setReason(message) + .build()); } /** diff --git a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java index ee96a13ee7..ceff1d299c 100644 --- 
a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java +++ b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java @@ -48,7 +48,10 @@ class HelixEventHandlingUtil { * @param dataAccessor * @return return true only when instance is Helix disabled and the disabled reason in * instanceConfig is cloudEvent + * @deprecated No need to check this if using InstanceOperation and specifying the trigger as CLOUD + * when enabling. */ + @Deprecated static boolean isInstanceDisabledForCloudEvent(String instanceName, HelixDataAccessor dataAccessor) { InstanceConfig instanceConfig = diff --git a/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java b/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java index 45b0dde766..cd2b16f922 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java @@ -19,6 +19,7 @@ * under the License. */ +import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -62,6 +63,21 @@ protected Map> getNonTrimmableFields(InstanceConfig insta return STATIC_TOPOLOGY_RELATED_FIELD_MAP; } + /** + * We should trim HELIX_INSTANCE_OPERATIONS field, it is used to filter instances in the + * BaseControllerDataProvider. That filtering will be used to determine if ResourceChangeSnapshot + * has changed as opposed to checking the actual value of the field. + * + * @param property the instance config + * @return a map contains all non-trimmable field keys that need to be kept. + */ + protected Map> getNonTrimmableKeys(InstanceConfig property) { + Map> nonTrimmableKeys = super.getNonTrimmableKeys(property); + nonTrimmableKeys.get(FieldType.LIST_FIELD) + .remove(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name()); + return nonTrimmableKeys; + } + @Override public InstanceConfig trimProperty(InstanceConfig property) { return new InstanceConfig(doTrim(property)); diff --git a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java index a91ae12d27..ce5d3de8c7 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java +++ b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java @@ -413,14 +413,15 @@ private void updateInstanceSets(Map instanceConfigMap, currentInstanceConfig.getLogicalId(clusterTopologyConfig.getEndNodeType()); newInstanceConfigMapByInstanceOperation.computeIfAbsent( - currentInstanceConfig.getInstanceOperation(), k -> new HashMap<>()) + currentInstanceConfig.getInstanceOperation().getOperation(), + k -> new HashMap<>()) .put(node, currentInstanceConfig); if (currentInstanceConfig.isAssignable()) { newAssignableInstanceConfigMap.put(node, currentInstanceConfig); } - if (currentInstanceConfig.getInstanceOperation() + if (currentInstanceConfig.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { swapInLogicalIdsByInstanceName.put(currentInstanceConfig.getInstanceName(), currentInstanceLogicalId); @@ -1079,7 +1080,8 @@ private void updateDisabledInstances(Collection allInstanceConfi _disabledInstanceSet.clear(); for (InstanceConfig config : allInstanceConfigs) { 
Map> disabledPartitionMap = config.getDisabledPartitionsMap(); - if (config.getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { + if (config.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.DISABLE)) { _disabledInstanceSet.add(config.getInstanceName()); } for (String resource : disabledPartitionMap.keySet()) { diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java index 1db0eccfca..714e9325d1 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java @@ -36,7 +36,6 @@ import org.apache.helix.HelixRebalanceException; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.LogUtil; -import org.apache.helix.controller.common.ResourcesStateMap; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.pipeline.AbstractBaseStage; import org.apache.helix.controller.pipeline.StageException; @@ -358,11 +357,12 @@ private boolean validateInstancesUnableToAcceptOnlineReplicasLimit(final Resourc if (maxInstancesUnableToAcceptOnlineReplicas >= 0) { // Instead of only checking the offline instances, we consider how many instances in the cluster // are not assignable and live. This is because some instances may be online but have an unassignable - // InstanceOperation such as EVACUATE, DISABLE, or UNKNOWN. We will exclude SWAP_IN instances from + // InstanceOperation such as EVACUATE, and DISABLE. We will exclude SWAP_IN and UNKNOWN instances from // they should not account against the capacity of the cluster. int instancesUnableToAcceptOnlineReplicas = cache.getInstanceConfigMap().entrySet().stream() - .filter(instanceEntry -> !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( - instanceEntry.getValue().getInstanceOperation())).collect(Collectors.toSet()) + .filter(instanceEntry -> !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceEntry.getValue().getInstanceOperation().getOperation())) + .collect(Collectors.toSet()) .size() - cache.getEnabledLiveInstances().size(); if (instancesUnableToAcceptOnlineReplicas > maxInstancesUnableToAcceptOnlineReplicas) { String errMsg = String.format( diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java index 3bf23d22ef..da972d682c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java @@ -109,6 +109,7 @@ public void process(ClusterEvent event) throws Exception { // Only update the currentStateExcludingUnknown if the instance is not in UNKNOWN InstanceOperation. if (instanceConfig == null || !instanceConfig.getInstanceOperation() + .getOperation() .equals(InstanceConstants.InstanceOperation.UNKNOWN)) { // update current states. 
updateCurrentStates(instance, diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index 8c873b4cdb..39ae9ae67c 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -42,7 +42,6 @@ import javax.annotation.Nullable; import com.google.common.collect.ImmutableSet; -import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.helix.AccessOption; import org.apache.helix.BaseDataAccessor; import org.apache.helix.ConfigAccessor; @@ -69,7 +68,6 @@ import org.apache.helix.model.ClusterConstraints; import org.apache.helix.model.ClusterConstraints.ConstraintType; import org.apache.helix.model.ClusterStatus; -import org.apache.helix.model.ClusterTopologyConfig; import org.apache.helix.model.ConstraintItem; import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.CurrentState; @@ -89,11 +87,11 @@ import org.apache.helix.model.PauseSignal; import org.apache.helix.model.ResourceConfig; import org.apache.helix.model.StateModelDefinition; -import org.apache.helix.model.builder.HelixConfigScopeBuilder; import org.apache.helix.msdcommon.exception.InvalidRoutingDataException; import org.apache.helix.tools.DefaultIdealStateCalculator; import org.apache.helix.util.ConfigStringUtil; import org.apache.helix.util.HelixUtil; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.util.RebalanceUtil; import org.apache.helix.zookeeper.api.client.HelixZkClient; import org.apache.helix.zookeeper.api.client.RealmAwareZkClient; @@ -125,6 +123,7 @@ public class ZKHelixAdmin implements HelixAdmin { private final RealmAwareZkClient _zkClient; private final ConfigAccessor _configAccessor; + private final BaseDataAccessor _baseDataAccessor; // true if ZKHelixAdmin was instantiated with a RealmAwareZkClient, false otherwise // This is used for close() to determine how ZKHelixAdmin should close the underlying ZkClient private final boolean _usesExternalZkClient; @@ -142,6 +141,7 @@ public class ZKHelixAdmin implements HelixAdmin { public ZKHelixAdmin(RealmAwareZkClient zkClient) { _zkClient = zkClient; _configAccessor = new ConfigAccessor(zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = true; } @@ -182,12 +182,14 @@ public ZKHelixAdmin(String zkAddress) { _zkClient = zkClient; _configAccessor = new ConfigAccessor(_zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = false; } private ZKHelixAdmin(RealmAwareZkClient zkClient, boolean usesExternalZkClient) { _zkClient = zkClient; _configAccessor = new ConfigAccessor(_zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = usesExternalZkClient; } @@ -206,7 +208,8 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { } List matchingLogicalIdInstances = - findInstancesMatchingLogicalId(clusterName, instanceConfig); + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); if (matchingLogicalIdInstances.size() > 1) { throw new HelixException( "There are already more than one instance with the same logicalId in the cluster: " @@ -216,17 +219,16 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { } InstanceConstants.InstanceOperation attemptedInstanceOperation = - instanceConfig.getInstanceOperation(); + 
instanceConfig.getInstanceOperation().getOperation(); try { - validateInstanceOperationTransition(instanceConfig, - !matchingLogicalIdInstances.isEmpty() ? matchingLogicalIdInstances.get(0) : null, - InstanceConstants.InstanceOperation.UNKNOWN, - attemptedInstanceOperation, clusterName); + InstanceUtil.validateInstanceOperationTransition(_configAccessor, clusterName, instanceConfig, + InstanceConstants.InstanceOperation.UNKNOWN, attemptedInstanceOperation); } catch (HelixException e) { instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.UNKNOWN); logger.error("Failed to add instance " + instanceConfig.getInstanceName() + " to cluster " + clusterName + " with instance operation " + attemptedInstanceOperation + ". Setting INSTANCE_OPERATION to " + instanceConfig.getInstanceOperation() + .getOperation() + " instead.", e); } @@ -240,8 +242,7 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { _zkClient.createPersistent(PropertyPathBuilder.instanceError(clusterName, nodeId), true); _zkClient.createPersistent(PropertyPathBuilder.instanceStatusUpdate(clusterName, nodeId), true); _zkClient.createPersistent(PropertyPathBuilder.instanceHistory(clusterName, nodeId), true); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.participantHistory(nodeId), new ParticipantHistory(nodeId)); } @@ -344,8 +345,7 @@ public InstanceConfig getInstanceConfig(String clusterName, String instanceName) "instance" + instanceName + " does not exist in cluster " + clusterName); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -364,8 +364,7 @@ public boolean setInstanceConfig(String clusterName, String instanceName, "instance" + instanceName + " does not exist in cluster " + clusterName); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey instanceConfigPropertyKey = accessor.keyBuilder().instanceConfig(instanceName); InstanceConfig currentInstanceConfig = accessor.getProperty(instanceConfigPropertyKey); if (!newInstanceConfig.getHostName().equals(currentInstanceConfig.getHostName()) @@ -397,9 +396,6 @@ public void enableInstance(final String clusterName, final String instanceName, // Eventually we will have all instances' enable/disable information in clusterConfig. Now we // update both instanceConfig and clusterConfig in transition period. 
enableSingleInstance(clusterName, instanceName, enabled, baseAccessor, disabledType, reason); -// enableBatchInstances(clusterName, Collections.singletonList(instanceName), enabled, -// baseAccessor, disabledType, reason); - } @Deprecated @@ -413,62 +409,6 @@ public void enableInstance(String clusterName, List instances, boolean e //enableInstance(clusterName, instances, enabled, null, null); } - private void validateInstanceOperationTransition(InstanceConfig instanceConfig, - InstanceConfig matchingLogicalIdInstance, - InstanceConstants.InstanceOperation currentOperation, - InstanceConstants.InstanceOperation targetOperation, - String clusterName) { - boolean targetStateEnableOrDisable = - targetOperation.equals(InstanceConstants.InstanceOperation.ENABLE) - || targetOperation.equals(InstanceConstants.InstanceOperation.DISABLE); - switch (currentOperation) { - case ENABLE: - case DISABLE: - // ENABLE or DISABLE can be set to ENABLE, DISABLE, or EVACUATE at any time. - if (ImmutableSet.of(InstanceConstants.InstanceOperation.ENABLE, - InstanceConstants.InstanceOperation.DISABLE, - InstanceConstants.InstanceOperation.EVACUATE).contains(targetOperation)) { - return; - } - case SWAP_IN: - // We can only ENABLE or DISABLE a SWAP_IN instance if there is an instance with matching logicalId - // with an InstanceOperation set to UNKNOWN. - if ((targetStateEnableOrDisable && (matchingLogicalIdInstance == null - || matchingLogicalIdInstance.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.UNKNOWN))) || targetOperation.equals( - InstanceConstants.InstanceOperation.UNKNOWN)) { - return; - } - case EVACUATE: - // EVACUATE can only be set to ENABLE or DISABLE when there is no instance with the same - // logicalId in the cluster. - if ((targetStateEnableOrDisable && matchingLogicalIdInstance == null) - || targetOperation.equals(InstanceConstants.InstanceOperation.UNKNOWN)) { - return; - } - case UNKNOWN: - // UNKNOWN can be set to ENABLE or DISABLE when there is no instance with the same logicalId in the cluster - // or the instance with the same logicalId in the cluster has InstanceOperation set to EVACUATE. - // UNKNOWN can be set to SWAP_IN when there is an instance with the same logicalId in the cluster set to ENABLE, - // or DISABLE. - if ((targetStateEnableOrDisable && (matchingLogicalIdInstance == null - || matchingLogicalIdInstance.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE)))) { - return; - } else if (targetOperation.equals(InstanceConstants.InstanceOperation.SWAP_IN) - && matchingLogicalIdInstance != null && !ImmutableSet.of( - InstanceConstants.InstanceOperation.UNKNOWN, - InstanceConstants.InstanceOperation.EVACUATE) - .contains(matchingLogicalIdInstance.getInstanceOperation())) { - return; - } - default: - throw new HelixException( - "InstanceOperation cannot be set to " + targetOperation + " when the instance is in " - + currentOperation + " state"); - } - } - /** * Set the InstanceOperation of an instance in the cluster. 
* @@ -479,75 +419,57 @@ private void validateInstanceOperationTransition(InstanceConfig instanceConfig, @Override public void setInstanceOperation(String clusterName, String instanceName, @Nullable InstanceConstants.InstanceOperation instanceOperation) { + setInstanceOperation(clusterName, instanceName, instanceOperation, null, false); + } - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); - String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - - InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); - if (instanceConfig == null) { - throw new HelixException("Cluster " + clusterName + ", instance: " + instanceName - + ", instance config does not exist"); - } - List matchingLogicalIdInstances = - findInstancesMatchingLogicalId(clusterName, instanceConfig); - validateInstanceOperationTransition(instanceConfig, - !matchingLogicalIdInstances.isEmpty() ? matchingLogicalIdInstances.get(0) : null, - instanceConfig.getInstanceOperation(), - instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE : instanceOperation, - clusterName); - - boolean succeeded = baseAccessor.update(path, new DataUpdater() { - @Override - public ZNRecord update(ZNRecord currentData) { - if (currentData == null) { - throw new HelixException( - "Cluster: " + clusterName + ", instance: " + instanceName + ", participant config is null"); - } - - InstanceConfig config = new InstanceConfig(currentData); - config.setInstanceOperation(instanceOperation); - return config.getRecord(); - } - }, AccessOption.PERSISTENT); + /** + * Set the instanceOperation of and instance with {@link InstanceConstants.InstanceOperation}. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + */ + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason) { + setInstanceOperation(clusterName, instanceName, instanceOperation, reason, false); + } - if (!succeeded) { - throw new HelixException("Failed to update instance operation. Please check if instance is disabled."); - } + /** + * Set the instanceOperation of and instance with {@link InstanceConstants.InstanceOperation}. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + * @param overrideAll Whether to override all existing instance operations from all other + * instance operations + */ + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason, + boolean overrideAll) { + InstanceConfig.InstanceOperation instanceOperationObj = + new InstanceConfig.InstanceOperation.Builder().setOperation( + instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE + : instanceOperation).setReason(reason).setSource( + overrideAll ? 
InstanceConstants.InstanceOperationSource.ADMIN + : InstanceConstants.InstanceOperationSource.USER).build(); + InstanceUtil.setInstanceOperation(_configAccessor, _baseDataAccessor, clusterName, instanceName, + instanceOperationObj); } @Override public boolean isEvacuateFinished(String clusterName, String instanceName) { if (!instanceHasFullAutoCurrentStateOrMessage(clusterName, instanceName)) { InstanceConfig config = getInstanceConfig(clusterName, instanceName); - return config != null && config.getInstanceOperation() + return config != null && config.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.EVACUATE); } return false; } - /** - * Find the instance that the passed instance has a matching logicalId with. - * - * @param clusterName The cluster name - * @param instanceConfig The instance to find the matching instance for - * @return The matching instance if found, null otherwise. - */ - private List findInstancesMatchingLogicalId(String clusterName, - InstanceConfig instanceConfig) { - String logicalIdKey = - ClusterTopologyConfig.createFromClusterConfig(_configAccessor.getClusterConfig(clusterName)) - .getEndNodeType(); - return getConfigKeys( - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build()).stream() - .map(instanceName -> getInstanceConfig(clusterName, instanceName)).filter( - potentialInstanceConfig -> - !potentialInstanceConfig.getInstanceName().equals(instanceConfig.getInstanceName()) - && potentialInstanceConfig.getLogicalId(logicalIdKey) - .equals(instanceConfig.getLogicalId(logicalIdKey))) - .collect(Collectors.toList()); - } - /** * Check to see if swapping between two instances is ready to be completed. Checks: 1. Both * instances must be alive. 2. Both instances must only have one session and not be carrying over @@ -563,7 +485,7 @@ private List findInstancesMatchingLogicalId(String clusterName, */ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, String swapInInstanceName) { - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, baseAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); @@ -579,8 +501,8 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, "SwapOutInstance {} is {} + {} and SwapInInstance {} is OFFLINE + {} for cluster {}. Swap will" + " not complete unless SwapInInstance instance is ONLINE.", swapOutInstanceName, swapOutLiveInstance != null ? "ONLINE" : "OFFLINE", - swapOutInstanceConfig.getInstanceOperation(), swapInInstanceName, - swapInInstanceConfig.getInstanceOperation(), clusterName); + swapOutInstanceConfig.getInstanceOperation().getOperation(), swapInInstanceName, + swapInInstanceConfig.getInstanceOperation().getOperation(), clusterName); return false; } @@ -619,7 +541,7 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, // 4. If the swap-out instance is not alive or is disabled, we return true without checking // the current states on the swap-in instance. 
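      // --- Illustrative usage sketch (editor's aside, not part of this patch) ---------------
      // Shows how a caller might drive the new setInstanceOperation overloads defined above
      // together with isEvacuateFinished. "myCluster" and "host_12000" are hypothetical
      // placeholders; the HelixAdmin instance is assumed to be constructed elsewhere, and
      // org.apache.helix.HelixAdmin / org.apache.helix.constants.InstanceConstants are assumed
      // to be imported.
      public static void evacuateAndReenable(HelixAdmin admin) throws InterruptedException {
        // Evacuate with a human-readable reason; the source defaults to USER (overrideAll = false).
        admin.setInstanceOperation("myCluster", "host_12000",
            InstanceConstants.InstanceOperation.EVACUATE, "draining for maintenance");

        // Wait until no FULL_AUTO replicas remain on the instance.
        while (!admin.isEvacuateFinished("myCluster", "host_12000")) {
          Thread.sleep(1000L);
        }

        // Re-enable; overrideAll = true records the operation with the ADMIN source, which
        // clears operations previously stacked by other sources.
        admin.setInstanceOperation("myCluster", "host_12000",
            InstanceConstants.InstanceOperation.ENABLE, "maintenance complete", true);
      }
      // ---------------------------------------------------------------------------------------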
- if (swapOutLiveInstance == null || swapOutInstanceConfig.getInstanceOperation() + if (swapOutLiveInstance == null || swapOutInstanceConfig.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.DISABLE)) { return true; } @@ -697,7 +619,8 @@ public boolean canCompleteSwap(String clusterName, String instanceName) { } List swappingInstances = - findInstancesMatchingLogicalId(clusterName, instanceConfig); + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); if (swappingInstances.size() != 1) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -705,10 +628,10 @@ public boolean canCompleteSwap(String clusterName, String instanceName) { return false; } - InstanceConfig swapOutInstanceConfig = - !instanceConfig.getInstanceOperation().equals(InstanceConstants.InstanceOperation.SWAP_IN) + InstanceConfig swapOutInstanceConfig = !instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig : swappingInstances.get(0); - InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() + InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { @@ -735,7 +658,8 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, } List swappingInstances = - findInstancesMatchingLogicalId(clusterName, instanceConfig); + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); if (swappingInstances.size() != 1) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -743,10 +667,10 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, return false; } - InstanceConfig swapOutInstanceConfig = - !instanceConfig.getInstanceOperation().equals(InstanceConstants.InstanceOperation.SWAP_IN) + InstanceConfig swapOutInstanceConfig = !instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig : swappingInstances.get(0); - InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() + InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? 
instanceConfig : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { @@ -802,7 +726,7 @@ public boolean isReadyForPreparingJoiningCluster(String clusterName, String inst if (!instanceHasFullAutoCurrentStateOrMessage(clusterName, instanceName)) { InstanceConfig config = getInstanceConfig(clusterName, instanceName); return config != null && INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT.contains( - config.getInstanceOperation()); + config.getInstanceOperation().getOperation()); } return false; } @@ -816,7 +740,7 @@ public boolean isReadyForPreparingJoiningCluster(String clusterName, String inst */ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, String instanceName) { - HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); // check the instance is alive @@ -827,7 +751,7 @@ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, return false; } - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; // count number of sessions under CurrentState folder. If it is carrying over from prv session, // then there are > 1 session ZNodes. List sessions = baseAccessor.getChildNames(PropertyPathBuilder.instanceCurrentState(clusterName, instanceName), 0); @@ -867,7 +791,7 @@ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, public void enableResource(final String clusterName, final String resourceName, final boolean enabled) { logger.info("{} resource {} in cluster {}.", enabled ? "Enable" : "Disable", resourceName, clusterName); String path = PropertyPathBuilder.idealState(clusterName, resourceName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; if (!baseAccessor.exists(path, 0)) { throw new HelixException("Cluster " + clusterName + ", resource: " + resourceName + ", ideal-state does not exist"); @@ -894,7 +818,7 @@ public void enablePartition(final boolean enabled, final String clusterName, instanceName, clusterName); String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; // check instanceConfig exists if (!baseAccessor.exists(path, 0)) { @@ -973,8 +897,7 @@ public void enableCluster(String clusterName, boolean enabled) { public void enableCluster(String clusterName, boolean enabled, String reason) { logger.info("{} cluster {} for reason {}.", enabled ? "Enable" : "Disable", clusterName, reason == null ? 
"NULL" : reason); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); if (enabled) { @@ -998,8 +921,7 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) { @Override public boolean isInMaintenanceMode(String clusterName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getBaseDataAccessor() .exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT); @@ -1248,8 +1170,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, final String reason, final MaintenanceSignal.AutoTriggerReason internalReason, final Map customFields, final MaintenanceSignal.TriggeringEntity triggeringEntity) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); logger.info("Cluster {} {} {} maintenance mode for reason {}.", clusterName, triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? "automatically" @@ -1512,8 +1433,7 @@ public List getInstancesInClusterWithTag(String clusterName, String tag) List instances = _zkClient.getChildren(memberInstancesPath); List result = new ArrayList(); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); for (String instanceName : instances) { @@ -1659,8 +1579,7 @@ public List getResourcesInCluster(String clusterName) { public List getResourcesInClusterWithTag(String clusterName, String tag) { List resourcesWithTag = new ArrayList(); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); for (String resourceName : getResourcesInCluster(clusterName)) { @@ -1675,8 +1594,7 @@ public List getResourcesInClusterWithTag(String clusterName, String tag) @Override public IdealState getResourceIdealState(String clusterName, String resourceName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.idealStates(resourceName)); @@ -1688,8 +1606,7 @@ public void setResourceIdealState(String clusterName, String resourceName, logger .info("Set IdealState for resource {} in cluster {} with new IdealState {}.", resourceName, clusterName, idealState == null ? 
"NULL" : idealState.toString()); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.idealStates(resourceName), idealState); @@ -1731,8 +1648,7 @@ public void removeFromIdealState(String clusterName, String resourceName, IdealS @Override public ExternalView getResourceExternalView(String clusterName, String resourceName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.externalView(resourceName)); } @@ -1740,8 +1656,7 @@ public ExternalView getResourceExternalView(String clusterName, String resourceN @Override public CustomizedView getResourceCustomizedView(String clusterName, String resourceName, String customizedStateType) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.customizedView(customizedStateType, resourceName)); } @@ -1774,8 +1689,7 @@ public void addStateModelDef(String clusterName, String stateModelDef, } } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.stateModelDef(stateModelDef), stateModel); } @@ -1786,8 +1700,7 @@ public void dropResource(String clusterName, String resourceName) { if (!ZKUtil.isClusterSetup(clusterName, _zkClient)) { throw new HelixException("Cluster " + clusterName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.idealStates(resourceName)); @@ -1806,8 +1719,7 @@ public void addCloudConfig(String clusterName, CloudConfig cloudConfig) { CloudConfig.Builder builder = new CloudConfig.Builder(cloudConfig); CloudConfig cloudConfigBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.cloudConfig(), cloudConfigBuilder); } @@ -1815,8 +1727,7 @@ public void addCloudConfig(String clusterName, CloudConfig cloudConfig) { @Override public void removeCloudConfig(String clusterName) { logger.info("Remove Cloud Config for cluster {}.", clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.cloudConfig()); } @@ -1847,8 +1758,7 @@ public List getStateModelDefs(String clusterName) { @Override 
public StateModelDefinition getStateModelDef(String clusterName, String stateModelName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.stateModelDef(stateModelName)); @@ -1857,8 +1767,7 @@ public StateModelDefinition getStateModelDef(String clusterName, String stateMod @Override public void dropCluster(String clusterName) { logger.info("Deleting cluster {}.", clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); String root = "/" + clusterName; @@ -1935,8 +1844,7 @@ public void addCustomizedStateConfig(String clusterName, new CustomizedStateConfig.Builder(customizedStateConfig); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder); @@ -1947,8 +1855,7 @@ public void removeCustomizedStateConfig(String clusterName) { logger.info( "Remove CustomizedStateConfig from cluster {}.", clusterName); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.customizedStateConfig()); @@ -1967,8 +1874,7 @@ public void addTypeToCustomizedStateConfig(String clusterName, String type) { builder.addAggregationEnabledType(type); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); if(!accessor.updateProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder)) { @@ -1997,8 +1903,7 @@ public void removeTypeFromCustomizedStateConfig(String clusterName, String type) builder.removeAggregationEnabledType(type); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder); @@ -2021,7 +1926,7 @@ public void rebalance(String clusterName, String resourceName, int replica) { @Override public void onDemandRebalance(String clusterName) { - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; String path = PropertyPathBuilder.clusterConfig(clusterName); if (!baseAccessor.exists(path, 0)) { @@ -2204,7 +2109,7 @@ public void setConstraint(String clusterName, final ConstraintType 
constraintTyp final String constraintId, final ConstraintItem constraintItem) { logger.info("Set constraint type {} with constraint id {} for cluster {}.", constraintType, constraintId, clusterName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); String path = keyBuilder.constraint(constraintType.toString()).getPath(); @@ -2227,7 +2132,7 @@ public void removeConstraint(String clusterName, final ConstraintType constraint final String constraintId) { logger.info("Remove constraint type {} with constraint id {} for cluster {}.", constraintType, constraintId, clusterName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); String path = keyBuilder.constraint(constraintType.toString()).getPath(); @@ -2248,8 +2153,7 @@ public ZNRecord update(ZNRecord currentData) { @Override public ClusterConstraints getConstraints(String clusterName, ConstraintType constraintType) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); return accessor.getProperty(keyBuilder.constraint(constraintType.toString())); @@ -2331,8 +2235,7 @@ public void addInstanceTag(String clusterName, String instanceName, String tag) throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2352,8 +2255,7 @@ public void removeInstanceTag(String clusterName, String instanceName, String ta throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2373,8 +2275,7 @@ public void setInstanceZoneId(String clusterName, String instanceName, String zo throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2431,23 +2332,19 @@ public ZNRecord update(ZNRecord currentData) { } InstanceConfig config = new InstanceConfig(currentData); - config.setInstanceEnabled(enabled); - if (!enabled) { - // new disabled type and reason will overwrite existing ones. 
- config.resetInstanceDisabledTypeAndReason(); - if (reason != null) { - config.setInstanceDisabledReason(reason); - } - if (disabledType != null) { - config.setInstanceDisabledType(disabledType); - } - } + config.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE).setReason(reason).setSource( + disabledType != null + ? InstanceConstants.InstanceOperationSource.instanceDisabledTypeToInstanceOperationSource( + disabledType) : null).build()); return config.getRecord(); } }, AccessOption.PERSISTENT); } // TODO: Add history ZNode for all batched enabling/disabling histories with metadata. + @Deprecated private void enableBatchInstances(final String clusterName, final List instances, final boolean enabled, BaseDataAccessor baseAccessor, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -2783,8 +2680,7 @@ private Set findTimeoutOfflineInstances(String clusterName, long offline } } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); List instanceConfigNames = accessor.getChildNames(keyBuilder.instanceConfigs()); List instancePathNames = accessor.getChildNames(keyBuilder.instances()); diff --git a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java index de41646c39..1b3acd68d6 100644 --- a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java @@ -29,6 +29,11 @@ import java.util.Set; import java.util.stream.Collectors; +import javax.annotation.Nullable; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.ImmutableSet; import org.apache.helix.HelixException; import org.apache.helix.HelixProperty; @@ -48,11 +53,13 @@ public class InstanceConfig extends HelixProperty { * Configurable characteristics of an instance */ public enum InstanceConfigProperty { - HELIX_HOST, HELIX_PORT, HELIX_ZONE_ID, @Deprecated - HELIX_ENABLED, + HELIX_HOST, + HELIX_PORT, + HELIX_ZONE_ID, + @Deprecated HELIX_ENABLED, HELIX_ENABLED_TIMESTAMP, - HELIX_DISABLED_REASON, - HELIX_DISABLED_TYPE, + @Deprecated HELIX_DISABLED_REASON, + @Deprecated HELIX_DISABLED_TYPE, HELIX_DISABLED_PARTITION, TAG_LIST, INSTANCE_WEIGHT, @@ -60,15 +67,128 @@ public enum InstanceConfigProperty { DELAY_REBALANCE_ENABLED, MAX_CONCURRENT_TASK, INSTANCE_INFO_MAP, - INSTANCE_CAPACITY_MAP, - TARGET_TASK_THREAD_POOL_SIZE, - INSTANCE_OPERATION + INSTANCE_CAPACITY_MAP, TARGET_TASK_THREAD_POOL_SIZE, HELIX_INSTANCE_OPERATIONS + } + + public static class InstanceOperation { + private final Map _properties; + + private enum InstanceOperationProperties { + OPERATION, REASON, SOURCE, TIMESTAMP + } + + private InstanceOperation(@Nullable Map properties) { + // Default to ENABLE operation if no operation type is provided. + _properties = properties == null ? 
new HashMap<>() : properties; + if (!_properties.containsKey(InstanceOperationProperties.OPERATION.name())) { + _properties.put(InstanceOperationProperties.OPERATION.name(), + InstanceConstants.InstanceOperation.ENABLE.name()); + } + } + + public static class Builder { + private Map _properties = new HashMap<>(); + + /** + * Set the operation type for this instance operation. + * @param operationType InstanceOperation type of this instance operation. + */ + public Builder setOperation(@Nullable InstanceConstants.InstanceOperation operationType) { + _properties.put(InstanceOperationProperties.OPERATION.name(), + operationType == null ? InstanceConstants.InstanceOperation.ENABLE.name() + : operationType.name()); + return this; + } + + /** + * Set the reason for this instance operation. + * @param reason + */ + public Builder setReason(String reason) { + _properties.put(InstanceOperationProperties.REASON.name(), reason != null ? reason : ""); + return this; + } + + /** + * Set the source for this instance operation. + * @param source InstanceOperationSource + * that caused this instance operation to be triggered. + */ + public Builder setSource(InstanceConstants.InstanceOperationSource source) { + _properties.put(InstanceOperationProperties.SOURCE.name(), + source == null ? InstanceConstants.InstanceOperationSource.USER.name() + : source.name()); + return this; + } + + public InstanceOperation build() throws IllegalArgumentException { + if (!_properties.containsKey(InstanceOperationProperties.OPERATION.name())) { + throw new IllegalArgumentException( + "Instance operation type is not set, this is a required field."); + } + _properties.put(InstanceOperationProperties.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + return new InstanceOperation(_properties); + } + } + + /** + * Get the operation type of this instance operation. + * @return the InstanceOperation type + */ + public InstanceConstants.InstanceOperation getOperation() throws IllegalArgumentException { + return InstanceConstants.InstanceOperation.valueOf( + _properties.get(InstanceOperationProperties.OPERATION.name())); + } + + /** + * Get the reason for this instance operation. + * If the reason is not set, it will default to an empty string. + * + * @return the reason for this instance operation. + */ + public String getReason() { + return _properties.getOrDefault(InstanceOperationProperties.REASON.name(), ""); + } + + /** + * Get the InstanceOperationSource + * that caused this instance operation to be triggered. + * If the source is not set, it will default to DEFAULT. + * + * @return the InstanceOperationSource + *that caused this instance operation to be triggered. + */ + public InstanceConstants.InstanceOperationSource getSource() { + return InstanceConstants.InstanceOperationSource.valueOf( + _properties.getOrDefault(InstanceOperationProperties.SOURCE.name(), + InstanceConstants.InstanceOperationSource.USER.name())); + } + + /** + * Get the timestamp (milliseconds from epoch) when this instance operation was triggered. + * + * @return the timestamp when the instance operation was triggered. 
+ */ + public long getTimestamp() { + return Long.parseLong(_properties.get(InstanceOperationProperties.TIMESTAMP.name())); + } + + private void setTimestamp(long timestamp) { + _properties.put(InstanceOperationProperties.TIMESTAMP.name(), String.valueOf(timestamp)); + } + + private Map getProperties() { + return _properties; + } } public static final int WEIGHT_NOT_SET = -1; public static final int MAX_CONCURRENT_TASK_NOT_SET = -1; private static final int TARGET_TASK_THREAD_POOL_SIZE_NOT_SET = -1; private static final boolean HELIX_ENABLED_DEFAULT_VALUE = true; + private static final long HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE = -1; + private static final ObjectMapper _objectMapper = new ObjectMapper(); // These fields are not allowed to be overwritten by the merge method because // they are unique properties of an instance. @@ -79,6 +199,8 @@ public enum InstanceConfigProperty { private static final Logger _logger = LoggerFactory.getLogger(InstanceConfig.class.getName()); + private List _deserializedInstanceOperations; + /** * Instantiate for a specific instance * @param instanceId the instance identifier @@ -264,25 +386,28 @@ public boolean containsTag(String tag) { * enabled/disabled, return -1. */ public long getInstanceEnabledTime() { - return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), -1); + return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), + HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); } /** * Set the enabled state of the instance If user enables the instance, HELIX_DISABLED_REASON filed * will be removed. - * @deprecated This method is deprecated. Please use setInstanceOperation instead. * @param enabled true to enable, false to disable + * @deprecated This method is deprecated. Please use setInstanceOperation instead. */ @Deprecated public void setInstanceEnabled(boolean enabled) { // set instance operation only when we need to change InstanceEnabled value. - setInstanceEnabledHelper(enabled); + setInstanceEnabledHelper(enabled, null); } - private void setInstanceEnabledHelper(boolean enabled) { - _record.setBooleanField(InstanceConfigProperty.HELIX_ENABLED.toString(), enabled); - _record.setLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), System.currentTimeMillis()); + private void setInstanceEnabledHelper(boolean enabled, Long timestampOverride) { + _record.setBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), enabled); + _record.setLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), + timestampOverride != null ? timestampOverride : System.currentTimeMillis()); if (enabled) { + // TODO: Replace this when HELIX_ENABLED and HELIX_DISABLED_REASON is removed. resetInstanceDisabledTypeAndReason(); } } @@ -290,6 +415,7 @@ private void setInstanceEnabledHelper(boolean enabled) { /** * Removes HELIX_DISABLED_REASON and HELIX_DISABLED_TYPE entry from simple field. */ + @Deprecated public void resetInstanceDisabledTypeAndReason() { _record.getSimpleFields().remove(InstanceConfigProperty.HELIX_DISABLED_REASON.name()); _record.getSimpleFields().remove(InstanceConfigProperty.HELIX_DISABLED_TYPE.name()); @@ -298,19 +424,25 @@ public void resetInstanceDisabledTypeAndReason() { /** * Set the instance disabled reason when instance is disabled. * It will be a no-op when instance is enabled. + * @deprecated This method is deprecated. Please use . 
*/ + @Deprecated public void setInstanceDisabledReason(String disabledReason) { - if (getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { - _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), disabledReason); - } + if (getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), disabledReason); + } } /** * Set the instance disabled type when instance is disabled. * It will be a no-op when instance is enabled. + * @deprecated This method is deprecated. Please use setInstanceOperation along with + * InstanceOperation.Builder().setSource + *(...) */ + @Deprecated public void setInstanceDisabledType(InstanceConstants.InstanceDisabledType disabledType) { - if (getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { + if (getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), disabledType.name()); } @@ -319,7 +451,9 @@ public void setInstanceDisabledType(InstanceConstants.InstanceDisabledType disab /** * Get the instance disabled reason when instance is disabled. * @return Return instance disabled reason. Default is am empty string. + * @deprecated This method is deprecated. Please use getInstanceOperation().getReason() instead. */ + @Deprecated public String getInstanceDisabledReason() { return _record.getStringField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), ""); } @@ -328,63 +462,184 @@ public String getInstanceDisabledReason() { * * @return Return instance disabled type (org.apache.helix.constants.InstanceConstants.InstanceDisabledType) * Default is am empty string. + * @deprecated This method is deprecated. Please use getInstanceOperation().getSource + *() instead. */ + @Deprecated public String getInstanceDisabledType() { - if (!getInstanceOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { + if (_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), + HELIX_ENABLED_DEFAULT_VALUE)) { return InstanceConstants.INSTANCE_NOT_DISABLED; } return _record.getStringField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); } + private List getInstanceOperations() { + if (_deserializedInstanceOperations == null || _deserializedInstanceOperations.isEmpty()) { + // If the _deserializedInstanceOperations is not set, then we need to build it from the real + // helix property HELIX_INSTANCE_OPERATIONS. + List instanceOperations = + _record.getListField(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name()); + List newDeserializedInstanceOperations = new ArrayList<>(); + if (instanceOperations != null) { + for (String serializedInstanceOperation : instanceOperations) { + try { + Map properties = _objectMapper.readValue(serializedInstanceOperation, + new TypeReference>() { + }); + newDeserializedInstanceOperations.add(new InstanceOperation(properties)); + } catch (JsonProcessingException e) { + _logger.error( + "Failed to deserialize instance operation for instance: " + _record.getId(), e); + } + } + } + _deserializedInstanceOperations = newDeserializedInstanceOperations; + } + + return _deserializedInstanceOperations; + } + /** * Set the instance operation for this instance. 
+ * This method also sets the HELIX_ENABLED, HELIX_DISABLED_REASON, and HELIX_DISABLED_TYPE fields + * for backwards compatibility. * * @param operation the instance operation */ - public void setInstanceOperation(InstanceConstants.InstanceOperation operation) { - _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), - operation == null ? "" : operation.name()); - if (operation == null || operation == InstanceConstants.InstanceOperation.ENABLE - || operation == InstanceConstants.InstanceOperation.DISABLE) { + public void setInstanceOperation(InstanceOperation operation) { + List deserializedInstanceOperations = getInstanceOperations(); + + if (operation.getSource() == InstanceConstants.InstanceOperationSource.ADMIN) { + deserializedInstanceOperations.clear(); + } else { + // Remove the instance operation with the same source if it exists. + deserializedInstanceOperations.removeIf( + instanceOperation -> instanceOperation.getSource() == operation.getSource()); + } + if (operation.getOperation() == InstanceConstants.InstanceOperation.ENABLE) { + // Insert the operation after the last ENABLE or at the beginning if there isn't ENABLE in the list. + int insertIndex = 0; + for (int i = deserializedInstanceOperations.size() - 1; i >= 0; i--) { + if (deserializedInstanceOperations.get(i).getOperation() + == InstanceConstants.InstanceOperation.ENABLE) { + insertIndex = i + 1; + break; + } + } + deserializedInstanceOperations.add(insertIndex, operation); + } else { + deserializedInstanceOperations.add(operation); + } + // Set the actual field in the ZnRecord + _record.setListField(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name(), + deserializedInstanceOperations.stream().map(instanceOperation -> { + try { + return _objectMapper.writeValueAsString(instanceOperation.getProperties()); + } catch (JsonProcessingException e) { + throw new HelixException( + "Failed to serialize instance operation for instance: " + _record.getId() + + " Can't set the instance operation to: " + operation.getOperation(), e); + } + }).collect(Collectors.toList())); + + // TODO: Remove this when we are sure that all users are using the new InstanceOperation only and HELIX_ENABLED is removed. + if (operation.getOperation() == InstanceConstants.InstanceOperation.DISABLE) { // We are still setting the HELIX_ENABLED field for backwards compatibility. // It is possible that users will be using earlier version of HelixAdmin or helix-rest // is on older version. - // TODO: Remove this when we are sure that all users are using the new field INSTANCE_OPERATION. - setInstanceEnabledHelper(!(operation == InstanceConstants.InstanceOperation.DISABLE)); + + if (_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), true)) { + // Check if it is already disabled, if yes, then we don't need to set HELIX_ENABLED and HELIX_ENABLED_TIMESTAMP + setInstanceEnabledHelper(false, operation.getTimestamp()); + } + + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), + operation.getReason()); + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), + InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + } else if (operation.getOperation() == InstanceConstants.InstanceOperation.ENABLE) { + // If any of the other InstanceOperations are of type DISABLE, set that in the HELIX_ENABLED, + // HELIX_DISABLED_REASON, and HELIX_DISABLED_TYPE fields. 
+ InstanceOperation latestDisableInstanceOperation = null; + for (InstanceOperation instanceOperation : getInstanceOperations()) { + if (instanceOperation.getOperation() == InstanceConstants.InstanceOperation.DISABLE && ( + latestDisableInstanceOperation == null || instanceOperation.getTimestamp() + > latestDisableInstanceOperation.getTimestamp())) { + latestDisableInstanceOperation = instanceOperation; + } + } + + if (latestDisableInstanceOperation != null) { + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), + latestDisableInstanceOperation.getReason()); + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), + InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + } else { + setInstanceEnabledHelper(true, operation.getTimestamp()); + } } } + /** + * Set the instance operation for this instance. Provide the InstanceOperation enum and the reason + * and source will be set to default values. + * + * @param operation the instance operation + */ + public void setInstanceOperation(InstanceConstants.InstanceOperation operation) { + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(operation).build(); + setInstanceOperation(instanceOperation); + } + private void setInstanceOperationInit(InstanceConstants.InstanceOperation operation) { if (operation == null) { return; } - _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), operation.name()); + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(operation).setReason("INIT").build(); + // When an instance is created for the first time the timestamp is set to -1 so that if it + // is disabled it will not be considered within the delay window when it joins. + instanceOperation.setTimestamp(HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); + setInstanceOperation(instanceOperation); + } + + private InstanceOperation getActiveInstanceOperation() { + List instanceOperations = getInstanceOperations(); + + if (instanceOperations.isEmpty()) { + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.DEFAULT).build(); + instanceOperation.setTimestamp(HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); + return instanceOperation; + } + + // The last instance operation in the list is the most recent one. + // ENABLE operation should not be included in the list. + return instanceOperations.get(instanceOperations.size() - 1); } /** - * Get the InstanceOperation of this instance, default is ENABLE if nothing is set. If + * Get the InstanceOperationType of this instance, default is ENABLE if nothing is set. If * HELIX_ENABLED is set to false, then the instance operation is DISABLE for backwards * compatibility. * * @return the instance operation */ - public InstanceConstants.InstanceOperation getInstanceOperation() { - String instanceOperationString = - _record.getSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name()); - - InstanceConstants.InstanceOperation instanceOperation; + public InstanceOperation getInstanceOperation() { + InstanceOperation activeInstanceOperation = getActiveInstanceOperation(); try { - // If INSTANCE_OPERATION is not set, then the instance is enabled. - instanceOperation = (instanceOperationString == null || instanceOperationString.isEmpty()) - ? 
InstanceConstants.InstanceOperation.ENABLE - : InstanceConstants.InstanceOperation.valueOf(instanceOperationString); + activeInstanceOperation.getOperation(); } catch (IllegalArgumentException e) { - _logger.error("Invalid instance operation: " + instanceOperationString + " for instance: " - + _record.getId() + _logger.error("Invalid instance operation type for instance: " + _record.getId() + ". You may need to update your version of Helix to get support for this " + "type of InstanceOperation. Defaulting to UNKNOWN."); - return InstanceConstants.InstanceOperation.UNKNOWN; + activeInstanceOperation = + new InstanceOperation.Builder().setOperation(InstanceConstants.InstanceOperation.UNKNOWN) + .build(); } // Always respect the HELIX_ENABLED being set to false when instance operation is unset @@ -392,11 +647,16 @@ public InstanceConstants.InstanceOperation getInstanceOperation() { if (!_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), HELIX_ENABLED_DEFAULT_VALUE) && (InstanceConstants.INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS.contains( - instanceOperation))) { - return InstanceConstants.InstanceOperation.DISABLE; + activeInstanceOperation.getOperation()))) { + return new InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason(getInstanceDisabledReason()) + .setSource( + InstanceConstants.InstanceOperationSource.instanceDisabledTypeToInstanceOperationSource( + InstanceConstants.InstanceDisabledType.valueOf(getInstanceDisabledType()))) + .build(); } - return instanceOperation; + return activeInstanceOperation; } /** @@ -406,7 +666,7 @@ public InstanceConstants.InstanceOperation getInstanceOperation() { * @return true if enabled, false otherwise */ public boolean getInstanceEnabled() { - return getInstanceOperation().equals(InstanceConstants.InstanceOperation.ENABLE); + return getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.ENABLE); } /** @@ -416,7 +676,8 @@ public boolean getInstanceEnabled() { * @return true if the instance is assignable, false otherwise */ public boolean isAssignable() { - return InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains(getInstanceOperation()); + return InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains( + getInstanceOperation().getOperation()); } /** @@ -929,10 +1190,8 @@ public InstanceConfig build(String instanceId) { instanceConfig.addTag(tag); } - if (_instanceOperation == null && _instanceEnabled != HELIX_ENABLED_DEFAULT_VALUE) { - instanceConfig.setInstanceOperationInit( - _instanceEnabled ? 
InstanceConstants.InstanceOperation.ENABLE - : InstanceConstants.InstanceOperation.DISABLE); + if (_instanceOperation == null && !_instanceEnabled) { + instanceConfig.setInstanceOperationInit(InstanceConstants.InstanceOperation.DISABLE); } if (_instanceOperation != null && !_instanceOperation.equals( diff --git a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java index db9ada93c4..f634aac46e 100644 --- a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java +++ b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java @@ -181,8 +181,8 @@ public synchronized void refresh(HelixDataAccessor accessor) { private void updateRoutableInstanceConfigMap(Map instanceConfigMap) { _routableInstanceConfigMap = instanceConfigMap.entrySet().stream().filter( - (instanceConfigEntry) -> !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( - instanceConfigEntry.getValue().getInstanceOperation())) + (instanceConfigEntry) -> !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceConfigEntry.getValue().getInstanceOperation().getOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } @@ -190,8 +190,9 @@ private void updateRoutableLiveInstanceMap(Map instanceC Map liveInstanceMap) { _routableLiveInstanceMap = liveInstanceMap.entrySet().stream().filter( (liveInstanceEntry) -> instanceConfigMap.containsKey(liveInstanceEntry.getKey()) - && !InstanceConstants.UNSERVABLE_INSTANCE_OPERATIONS.contains( - instanceConfigMap.get(liveInstanceEntry.getKey()).getInstanceOperation())) + && !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceConfigMap.get(liveInstanceEntry.getKey()).getInstanceOperation() + .getOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java new file mode 100644 index 0000000000..967d561e74 --- /dev/null +++ b/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java @@ -0,0 +1,198 @@ +package org.apache.helix.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +import com.google.common.collect.ImmutableMap; +import org.apache.helix.AccessOption; +import org.apache.helix.BaseDataAccessor; +import org.apache.helix.ConfigAccessor; +import org.apache.helix.HelixException; +import org.apache.helix.PropertyPathBuilder; +import org.apache.helix.constants.InstanceConstants; +import org.apache.helix.model.ClusterTopologyConfig; +import org.apache.helix.model.HelixConfigScope; +import org.apache.helix.model.InstanceConfig; +import org.apache.helix.model.builder.HelixConfigScopeBuilder; +import org.apache.helix.zookeeper.datamodel.ZNRecord; +import org.apache.helix.zookeeper.zkclient.DataUpdater; + +public class InstanceUtil { + + // Private constructor to prevent instantiation + private InstanceUtil() { + } + + // Validators for instance operation transitions + private static final Function, Boolean> ALWAYS_ALLOWED = + (matchingInstances) -> true; + private static final Function, Boolean> ALL_MATCHES_ARE_UNKNOWN = + (matchingInstances) -> matchingInstances.isEmpty() || matchingInstances.stream().allMatch( + instance -> instance.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.UNKNOWN)); + private static final Function, Boolean> ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE = + (matchingInstances) -> matchingInstances.isEmpty() || matchingInstances.stream().allMatch( + instance -> instance.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.UNKNOWN) + || instance.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.EVACUATE)); + private static final Function, Boolean> ANY_MATCH_ENABLE_OR_DISABLE = + (matchingInstances) -> !matchingInstances.isEmpty() && matchingInstances.stream().anyMatch( + instance -> instance.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.ENABLE) || instance.getInstanceOperation() + .getOperation().equals(InstanceConstants.InstanceOperation.DISABLE)); + + // Validator map for valid instance operation transitions :: + private static final ImmutableMap, Boolean>>> + validInstanceOperationTransitions = + ImmutableMap.of(InstanceConstants.InstanceOperation.ENABLE, + // ENABLE and DISABLE can be set to UNKNOWN when matching instance is in SWAP_IN and set to ENABLE in a transaction. + ImmutableMap.of(InstanceConstants.InstanceOperation.ENABLE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.DISABLE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED), + InstanceConstants.InstanceOperation.DISABLE, + ImmutableMap.of(InstanceConstants.InstanceOperation.DISABLE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.ENABLE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED), + InstanceConstants.InstanceOperation.SWAP_IN, + // SWAP_IN can be set to ENABLE when matching instance is in UNKNOWN state in a transaction. 
+ ImmutableMap.of(InstanceConstants.InstanceOperation.SWAP_IN, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED), + InstanceConstants.InstanceOperation.EVACUATE, + ImmutableMap.of(InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.ENABLE, ALL_MATCHES_ARE_UNKNOWN, + InstanceConstants.InstanceOperation.DISABLE, ALL_MATCHES_ARE_UNKNOWN, + InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED), + InstanceConstants.InstanceOperation.UNKNOWN, + ImmutableMap.of(InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED, + InstanceConstants.InstanceOperation.ENABLE, ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE, + InstanceConstants.InstanceOperation.DISABLE, ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE, + InstanceConstants.InstanceOperation.SWAP_IN, ANY_MATCH_ENABLE_OR_DISABLE)); + + /** + * Validates if the transition from the current operation to the target operation is valid. + * + * @param configAccessor The ConfigAccessor instance + * @param clusterName The cluster name + * @param instanceConfig The current instance configuration + * @param currentOperation The current operation + * @param targetOperation The target operation + */ + public static void validateInstanceOperationTransition(ConfigAccessor configAccessor, + String clusterName, InstanceConfig instanceConfig, + InstanceConstants.InstanceOperation currentOperation, + InstanceConstants.InstanceOperation targetOperation) { + // Check if the current operation and target operation are in the valid transitions map + if (!validInstanceOperationTransitions.containsKey(currentOperation) + || !validInstanceOperationTransitions.get(currentOperation).containsKey(targetOperation)) { + throw new HelixException( + "Invalid instance operation transition from " + currentOperation + " to " + + targetOperation); + } + + // Throw exception if the validation fails + if (!validInstanceOperationTransitions.get(currentOperation).get(targetOperation) + .apply(findInstancesWithMatchingLogicalId(configAccessor, clusterName, instanceConfig))) { + throw new HelixException( + "Failed validation for instance operation transition from " + currentOperation + " to " + + targetOperation); + } + } + + /** + * Finds the instances that have a matching logical ID with the given instance. + * + * @param configAccessor The ConfigAccessor instance + * @param clusterName The cluster name + * @param instanceConfig The instance configuration to match + * @return A list of matching instances + */ + public static List findInstancesWithMatchingLogicalId( + ConfigAccessor configAccessor, String clusterName, InstanceConfig instanceConfig) { + String logicalIdKey = + ClusterTopologyConfig.createFromClusterConfig(configAccessor.getClusterConfig(clusterName)) + .getEndNodeType(); + + // Retrieve and filter instances with matching logical ID + return configAccessor.getKeys( + new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, + clusterName).build()).stream() + .map(instanceName -> configAccessor.getInstanceConfig(clusterName, instanceName)).filter( + potentialInstanceConfig -> + !potentialInstanceConfig.getInstanceName().equals(instanceConfig.getInstanceName()) + && potentialInstanceConfig.getLogicalId(logicalIdKey) + .equals(instanceConfig.getLogicalId(logicalIdKey))) + .collect(Collectors.toList()); + } + + /** + * Sets the instance operation for the given instance. 
+ * + * @param configAccessor The ConfigAccessor instance + * @param baseAccessor The BaseDataAccessor instance + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation to set + */ + public static void setInstanceOperation(ConfigAccessor configAccessor, + BaseDataAccessor baseAccessor, String clusterName, String instanceName, + InstanceConfig.InstanceOperation instanceOperation) { + String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); + + // Retrieve the current instance configuration + InstanceConfig instanceConfig = configAccessor.getInstanceConfig(clusterName, instanceName); + if (instanceConfig == null) { + throw new HelixException("Cluster " + clusterName + ", instance: " + instanceName + + ", instance config does not exist"); + } + + // Validate the instance operation transition + validateInstanceOperationTransition(configAccessor, clusterName, instanceConfig, + instanceConfig.getInstanceOperation().getOperation(), + instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE + : instanceOperation.getOperation()); + + // Update the instance operation + boolean succeeded = baseAccessor.update(path, new DataUpdater() { + @Override + public ZNRecord update(ZNRecord currentData) { + if (currentData == null) { + throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName + + ", participant config is null"); + } + + InstanceConfig config = new InstanceConfig(currentData); + config.setInstanceOperation(instanceOperation); + return config.getRecord(); + } + }, AccessOption.PERSISTENT); + + if (!succeeded) { + throw new HelixException( + "Failed to update instance operation. Please check if instance is disabled."); + } + } +} diff --git a/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java b/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java index bf9b59dc2d..5ea42dc3e7 100644 --- a/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java +++ b/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java @@ -52,20 +52,18 @@ public void testDisableInstance() { Assert.assertFalse(InstanceValidationUtil .isEnabled(_manager.getHelixDataAccessor(), _instanceManager.getInstanceName())); Assert.assertEquals(_manager.getConfigAccessor() - .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()) - .getInstanceDisabledType(), InstanceConstants.InstanceDisabledType.CLOUD_EVENT.name()); + .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()).getInstanceOperation() + .getSource(), InstanceConstants.InstanceOperationSource.AUTOMATION); - // Should not disable instance if it is already disabled due to other reasons - // And disabled type should remain unchanged _admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), false); _impl.disableInstance(_instanceManager, null); Assert.assertFalse(InstanceValidationUtil .isEnabled(_manager.getHelixDataAccessor(), _instanceManager.getInstanceName())); Assert.assertEquals(_manager.getConfigAccessor() .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()) - .getInstanceDisabledType(), - InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + .getInstanceOperation().getSource(), InstanceConstants.InstanceOperationSource.USER); + _admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), true); 
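    // --- Illustrative sketch (editor's aside, not part of this patch) -----------------------
    // The assertions above read the operation source through the InstanceOperation object now
    // returned by InstanceConfig#getInstanceOperation(). A caller inspecting why an instance is
    // disabled might do the following; clusterName/instanceName are hypothetical placeholders,
    // the ConfigAccessor is assumed to be constructed elsewhere, and org.apache.helix.* types
    // are assumed to be imported.
    private static void logDisableDetails(ConfigAccessor configAccessor, String clusterName,
        String instanceName) {
      InstanceConfig config = configAccessor.getInstanceConfig(clusterName, instanceName);
      InstanceConfig.InstanceOperation operation = config.getInstanceOperation();
      if (operation.getOperation() == InstanceConstants.InstanceOperation.DISABLE) {
        System.out.println("Instance " + instanceName + " disabled by " + operation.getSource()
            + " at " + operation.getTimestamp() + " because: " + operation.getReason());
      }
    }
    // -----------------------------------------------------------------------------------------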
_admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), false, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, null); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java index 85600c01c1..67b575f0c0 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java @@ -211,7 +211,7 @@ private void removeOfflineOrInactiveInstances() { InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, participantName); if (!_participants.get(i).isConnected() || !instanceConfig.getInstanceEnabled() - || instanceConfig.getInstanceOperation() + || instanceConfig.getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { if (_participants.get(i).isConnected()) { _participants.get(i).syncStop(); @@ -338,7 +338,7 @@ public void testAddingNodeWithEvacuationTag() throws Exception { // now remove operation tag String instanceToEvacuate = _participants.get(0).getInstanceName(); _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -370,7 +370,8 @@ public void testNodeSwapNoTopologySetup() throws Exception { Assert.assertEquals( _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) - .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } @Test(dependsOnMethods = "testNodeSwapNoTopologySetup") @@ -397,7 +398,8 @@ public void testAddingNodeWithEnableInstanceOperation() throws Exception { Assert.assertEquals( _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) - .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } @Test(dependsOnMethods = "testAddingNodeWithEnableInstanceOperation") @@ -416,7 +418,8 @@ public void testNodeSwapWithNoSwapOutNode() throws Exception { Assert.assertEquals( _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) - .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } @Test(dependsOnMethods = "testNodeSwapWithNoSwapOutNode") @@ -440,7 +443,8 @@ public void testNodeSwapSwapInNodeNoInstanceOperationEnabled() throws Exception Assert.assertEquals( _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) - .getInstanceOperation(), InstanceConstants.InstanceOperation.UNKNOWN); + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); // Setting the InstanceOperation to SWAP_IN should work because there is a matching logicalId in // the cluster and the InstanceCapacityWeights and FaultZone match. @@ -481,7 +485,8 @@ public void testNodeSwapSwapInNodeWithAlreadySwappingPair() throws Exception { // Instance should be UNKNOWN since there was already a swapping pair. 
Assert.assertEquals(_gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, secondInstanceToSwapInName).getInstanceOperation(), + .getInstanceConfig(CLUSTER_NAME, secondInstanceToSwapInName).getInstanceOperation() + .getOperation(), InstanceConstants.InstanceOperation.UNKNOWN); // Try to set the InstanceOperation to SWAP_IN, it should throw an exception since there is already @@ -576,7 +581,8 @@ public void testNodeSwap() throws Exception { Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); Assert.assertEquals(_gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation(), + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation() + .getOperation(), InstanceConstants.InstanceOperation.UNKNOWN); // Check to make sure the throttle was enabled again after the swap was completed. @@ -681,7 +687,8 @@ public void testNodeSwapDisableAndReenable() throws Exception { Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); Assert.assertEquals(_gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation(), + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation() + .getOperation(), InstanceConstants.InstanceOperation.UNKNOWN); // Validate that the SWAP_IN instance has the same partitions the swap out instance had before @@ -824,7 +831,7 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { Collections.emptySet(), Collections.emptySet()); _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, InstanceConstants.InstanceOperation.ENABLE); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -1110,7 +1117,7 @@ public void testUnsetInstanceOperationOnSwapInWhenSwapping() throws Exception { // This should throw exception because we cannot ever have two instances with the same logicalId and both have InstanceOperation // unset. 
_gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, null); + .setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, InstanceConstants.InstanceOperation.ENABLE); } @Test(dependsOnMethods = "testUnsetInstanceOperationOnSwapInWhenSwapping") @@ -1180,7 +1187,7 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { // cancel the evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); assignment = getEVs(); for (String resource : _allDBs) { @@ -1222,7 +1229,7 @@ public void testEvacuateAndCancelBeforeDropFinish() throws Exception { // cancel evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); // check every replica has >= 3 active replicas, even before cluster converge Map assignment = getEVs(); for (String resource : _allDBs) { @@ -1311,7 +1318,7 @@ public void testSwapEvacuateAddRemoveEvacuate() throws Exception { // Remove EVACUATE instance's InstanceOperation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, InstanceConstants.InstanceOperation.ENABLE); } @Test(dependsOnMethods = "testSwapEvacuateAddRemoveEvacuate") @@ -1392,7 +1399,7 @@ public boolean isThrottlesEnabled() { @Override public void onInstanceConfigChange(List instanceConfig, NotificationContext context) { - if (instanceConfig.get(0).getInstanceOperation() + if (instanceConfig.get(0).getInstanceOperation().getOperation() .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { throttlesEnabled = false; } else { diff --git a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java index 5581108578..b54be8c045 100644 --- a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java @@ -182,8 +182,7 @@ public void testZkHelixAdmin() { String disableReason = "Reason"; tool.enableInstance(clusterName, instanceName, false, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, disableReason); - Assert.assertTrue(tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason() - .equals(disableReason)); + Assert.assertEquals(disableReason, tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason()); tool.enableInstance(clusterName, instanceName, true, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, disableReason); Assert.assertTrue( @@ -348,6 +347,65 @@ public void testZkHelixAdmin() { System.out.println("END testZkHelixAdmin at " + new Date(System.currentTimeMillis())); } + @Test + private void testSetInstanceOperation() { + System.out.println("START testSetInstanceOperation at " + new Date(System.currentTimeMillis())); + + final String clusterName = getShortClassName(); + String rootPath = "/" + clusterName; + if (_gZkClient.exists(rootPath)) { + _gZkClient.deleteRecursively(rootPath); + } + + HelixAdmin tool = new ZKHelixAdmin(_gZkClient); + tool.addCluster(clusterName, true); + Assert.assertTrue(ZKUtil.isClusterSetup(clusterName, _gZkClient)); + 
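Stepping back from the individual hunks for a moment: the read-side change repeated throughout the tests above is that getInstanceOperation() now returns an InstanceConfig.InstanceOperation object rather than the bare enum, so callers append .getOperation() and can additionally read the reason and source. A minimal sketch of the new read pattern, assuming a HelixAdmin handle and placeholder cluster and instance names:

    InstanceConfig config = admin.getInstanceConfig("MyCluster", "host1_9999");  // admin handle assumed
    InstanceConstants.InstanceOperation op = config.getInstanceOperation().getOperation();
    String reason = config.getInstanceOperation().getReason();
    InstanceConstants.InstanceOperationSource source = config.getInstanceOperation().getSource();
    if (op == InstanceConstants.InstanceOperation.EVACUATE) {
      // Treat the instance as draining (still alive, being moved off) rather than hard-disabled.
    }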
Assert.assertTrue(_gZkClient.exists(PropertyPathBuilder.customizedStateConfig(clusterName))); + + // Add instance to cluster + String hostname = "host1"; + String port = "9999"; + String instanceName = hostname + "_" + port; + InstanceConfig config = + new InstanceConfig.Builder().setHostName(hostname).setPort(port).build(instanceName); + + tool.addInstance(clusterName, config); + + // Set instance operation to DISABLE + tool.setInstanceOperation(clusterName, instanceName, + InstanceConstants.InstanceOperation.DISABLE, "disableReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.DISABLE); + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason(), + "disableReason"); + + // Set instance operation to ENABLE + tool.setInstanceOperation(clusterName, instanceName, InstanceConstants.InstanceOperation.ENABLE, + "enableReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + // InstanceNonServingReason should be empty after setting operation to ENABLE + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason(), ""); + + // Set instance operation to UNKNOWN + tool.setInstanceOperation(clusterName, instanceName, + InstanceConstants.InstanceOperation.UNKNOWN, "unknownReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation().getReason(), + "unknownReason"); + + deleteCluster(clusterName); + + System.out.println("END testSetInstanceOperation at " + new Date(System.currentTimeMillis())); + } + private HelixManager initializeHelixManager(String clusterName, String instanceName) { HelixManager manager = HelixManagerFactory.getZKHelixManager(clusterName, instanceName, InstanceType.PARTICIPANT, org.apache.helix.common.ZkTestBase.ZK_ADDR); diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index d9bc5d7fe6..c5c5626ff6 100644 --- a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -24,6 +24,8 @@ import java.util.List; import java.util.Map; +import javax.annotation.Nullable; + import org.apache.helix.BaseDataAccessor; import org.apache.helix.HelixAdmin; import org.apache.helix.HelixDataAccessor; @@ -285,16 +287,12 @@ public void enableInstance(String clusterName, String instanceName, boolean enab ZNRecord record = (ZNRecord) _baseDataAccessor.get(instanceConfigPath, null, 0); InstanceConfig instanceConfig = new InstanceConfig(record); - instanceConfig.setInstanceOperation(enabled ? InstanceConstants.InstanceOperation.ENABLE - : InstanceConstants.InstanceOperation.DISABLE); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE).setReason(reason).build()); if (!enabled) { + // TODO: Replace this when the HELIX_ENABLED and HELIX_DISABLED fields are removed. 
instanceConfig.resetInstanceDisabledTypeAndReason(); - if (reason != null) { - instanceConfig.setInstanceDisabledReason(reason); - } - if (disabledType != null) { - instanceConfig.setInstanceDisabledType(disabledType); - } } _baseDataAccessor.set(instanceConfigPath, instanceConfig.getRecord(), 0); } @@ -307,7 +305,20 @@ public void enableInstance(String clusterName, List instances, boolean e @Override public void setInstanceOperation(String clusterName, String instanceName, - InstanceConstants.InstanceOperation instanceOperation) { + @Nullable InstanceConstants.InstanceOperation instanceOperation) { + setInstanceOperation(clusterName, instanceName, instanceOperation, null, false); + } + + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason) { + setInstanceOperation(clusterName, instanceName, instanceOperation, reason, false); + } + + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason, + boolean overrideAll) { } @Override diff --git a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java index 7da983b8aa..47ea88ac4d 100644 --- a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java +++ b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java @@ -52,7 +52,6 @@ public void testGetParsedDomain() { public void testSetInstanceEnableWithReason() { InstanceConfig instanceConfig = new InstanceConfig(new ZNRecord("id")); instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); - instanceConfig.setInstanceDisabledReason("NoShowReason"); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() @@ -62,10 +61,9 @@ public void testSetInstanceEnableWithReason() { Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() .get(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_TYPE.toString()), null); - - instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); String reasonCode = "ReasonCode"; - instanceConfig.setInstanceDisabledReason(reasonCode); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason(reasonCode).build()); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() .get(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED.toString()), "false"); @@ -198,6 +196,30 @@ public void testInstanceConfigBuilder() { Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight1"), Integer.valueOf(1)); } + @Test + public void testInstanceOperationReason() { + InstanceConfig instanceConfig = new InstanceConfig("instance1"); + instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceDisabledReason("disableReason"); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.UNKNOWN).setReason("unknownReason").build()); + 
Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), "unknownReason"); + + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("disableReason2").build()); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason2"); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), "disableReason2"); + + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), ""); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), ""); + } + @Test public void testOverwriteInstanceConfig() { InstanceConfig instanceConfig = new InstanceConfig("instance2"); @@ -233,9 +255,91 @@ public void testOverwriteInstanceConfig() { Assert.assertTrue(instanceConfig.getTags().contains("tag4")); Assert.assertFalse(instanceConfig.getRecord().getSimpleFields() .containsKey(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED.toString())); - Assert.assertEquals(instanceConfig.getInstanceOperation(), + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), InstanceConstants.InstanceOperation.EVACUATE); Assert.assertFalse(instanceConfig.getInstanceCapacityMap().containsKey("weight1")); Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight2"), Integer.valueOf(2)); } + + @Test + public void testInstanceOperationMultipleSources() throws InterruptedException { + InstanceConfig instanceConfig = new InstanceConfig("instance1"); + + // Check that the instance operation is ENABLE from the DEFAULT source + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.DEFAULT); + + // Set instance operation from user source + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("userReason") + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + // Get enabled time + long op1EnabledTime = instanceConfig.getInstanceEnabledTime(); + + Thread.sleep(1000); + // Set instance operation from automation source + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("automationReason") + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Check that the enabled time is the same as op1 but the source and reason is changed to automation + Assert.assertEquals(instanceConfig.getInstanceEnabledTime(), op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + Thread.sleep(1000); + // Set instance operation from user source to be ENABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + + // Check that the operation is DISABLE, the enabled time is the same as op1, and the source is still automation + 
Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.DISABLE); + Assert.assertEquals(instanceConfig.getInstanceEnabledTime(), op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + Thread.sleep(1000); + // Set the instance operation from the automation source to be ENABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Check that the operation is ENABLE, the enabled time is the different from op1, and the source is still automation + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertFalse(instanceConfig.getInstanceEnabledTime() == op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + // Set the instance operation from the automation source to be EVACUATE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.EVACUATE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Set the instance operation from the user source to be DISABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE) + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + + // Check that the instance operation is DISABLE and the source is user + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.DISABLE); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.USER); + + // Set the instance operation from the admin source to be ENABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.ADMIN).build()); + + // Check that the instance operation is ENABLE and the source is admin + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.ADMIN); + } } diff --git a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java index 8a4bbf07bd..bb5a2bc5c4 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java @@ -268,8 +268,8 @@ private void collectEvacuatingInstances(Set toBeStoppedInstances) { PropertyKey.Builder propertyKeyBuilder = _dataAccessor.keyBuilder(); InstanceConfig instanceConfig = _dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instance)); - if (InstanceConstants.InstanceOperation.EVACUATE - .equals(instanceConfig.getInstanceOperation())) { + if (InstanceConstants.InstanceOperation.EVACUATE.equals( + 
instanceConfig.getInstanceOperation().getOperation())) { toBeStoppedInstances.add(instance); } } diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java index ea98f66371..55fc4de36e 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java @@ -45,12 +45,14 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableMap; +import org.apache.helix.BaseDataAccessor; import org.apache.helix.ConfigAccessor; import org.apache.helix.HelixAdmin; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixDataAccessor; +import org.apache.helix.manager.zk.ZkBaseDataAccessor; import org.apache.helix.model.CurrentState; import org.apache.helix.model.Error; import org.apache.helix.model.HealthStat; @@ -66,6 +68,7 @@ import org.apache.helix.rest.server.filters.ClusterAuth; import org.apache.helix.rest.server.json.instance.InstanceInfo; import org.apache.helix.rest.server.json.instance.StoppableCheck; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.eclipse.jetty.util.StringUtil; import org.slf4j.Logger; @@ -388,9 +391,11 @@ record = toZNRecord(content); @POST public Response updateInstance(@PathParam("clusterId") String clusterId, @PathParam("instanceName") String instanceName, @QueryParam("command") String command, - @QueryParam("instanceOperation") InstanceConstants.InstanceOperation state, - @QueryParam("instanceDisabledType") String disabledType, - @QueryParam("instanceDisabledReason") String disabledReason, + @QueryParam("instanceOperation") InstanceConstants.InstanceOperation instanceOperation, + @QueryParam("instanceOperationSource") InstanceConstants.InstanceOperationSource instanceOperationSource, + @QueryParam("reason") String reason, + @Deprecated @QueryParam("instanceDisabledType") String disabledType, + @Deprecated @QueryParam("instanceDisabledReason") String disabledReason, @QueryParam("force") boolean force, String content) { Command cmd; try { @@ -445,7 +450,12 @@ public Response updateInstance(@PathParam("clusterId") String clusterId, .getTypeFactory().constructCollectionType(List.class, String.class))); break; case setInstanceOperation: - admin.setInstanceOperation(clusterId, instanceName, state); + InstanceUtil.setInstanceOperation(new ConfigAccessor(getRealmAwareZkClient()), + new ZkBaseDataAccessor<>(getRealmAwareZkClient()), clusterId, instanceName, + new InstanceConfig.InstanceOperation.Builder().setOperation(instanceOperation) + .setReason(reason).setSource( + force ? 
InstanceConstants.InstanceOperationSource.ADMIN : instanceOperationSource) + .build()); break; case canCompleteSwap: return OK(OBJECT_MAPPER.writeValueAsString( diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java index 395f9bf858..6ab727e85e 100644 --- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java +++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java @@ -495,14 +495,14 @@ public void updateInstance() throws Exception { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals(instanceConfig.getInstanceOperation(), + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), InstanceConstants.InstanceOperation.EVACUATE); new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=INVALIDOP") .expectedReturnStatusCode(Response.Status.NOT_FOUND.getStatusCode()).format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals(instanceConfig.getInstanceOperation(), + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), InstanceConstants.InstanceOperation.ENABLE); // test canCompleteSwap @@ -543,7 +543,7 @@ public void updateInstance() throws Exception { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals(instanceConfig.getInstanceOperation(), + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), InstanceConstants.InstanceOperation.EVACUATE); Response response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished") @@ -586,7 +586,7 @@ public void updateInstance() throws Exception { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, test_instance_name).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, test_instance_name); - Assert.assertEquals(instanceConfig.getInstanceOperation(), + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), InstanceConstants.InstanceOperation.EVACUATE); response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished") From 55bdc0800a0b6f077d084c593a22931f2a059c79 Mon Sep 17 00:00:00 2001 From: xyuanlu Date: Tue, 11 Jun 2024 13:52:13 -0700 Subject: [PATCH 11/11] Fix config for metaclient leader election client. 
(#2807) --- .../recipes/leaderelection/LeaderElectionClient.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java index 373d360132..ae7d9c9fab 100644 --- a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java +++ b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java @@ -97,7 +97,13 @@ public LeaderElectionClient(MetaClientConfig metaClientConfig, String participan LOG.info("Creating MetaClient for LeaderElectionClient"); if (MetaClientConfig.StoreType.ZOOKEEPER.equals(metaClientConfig.getStoreType())) { ZkMetaClientConfig zkMetaClientConfig = new ZkMetaClientConfig.ZkMetaClientConfigBuilder().setConnectionAddress( - metaClientConfig.getConnectionAddress()).setZkSerializer((new LeaderInfoSerializer())).build(); + metaClientConfig.getConnectionAddress()) + .setZkSerializer((new LeaderInfoSerializer())) + .setSessionTimeoutInMillis(metaClientConfig.getSessionTimeoutInMillis()) + .setMetaClientReconnectPolicy(metaClientConfig.getMetaClientReconnectPolicy()) + .setConnectionInitTimeoutInMillis(metaClientConfig.getConnectionInitTimeoutInMillis()) + .setAuthEnabled(metaClientConfig.isAuthEnabled()) + .build(); _metaClient = new ZkMetaClientFactory().getMetaClient(zkMetaClientConfig); _metaClient.connect(); _metaClient.subscribeStateChanges(_connectStateListener);
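To make the effect of this last fix concrete, here is a hedged caller-side sketch. Only the LeaderElectionClient constructor shape comes from the hunk above; the MetaClientConfig builder usage and all concrete values are assumptions:

    // Sketch only: a ZooKeeper-backed MetaClientConfig supplied by the caller. With this fix, its
    // session timeout, reconnect policy, connection-init timeout, and auth flag are forwarded into
    // the ZkMetaClientConfig built inside LeaderElectionClient instead of being dropped.
    MetaClientConfig metaClientConfig = new MetaClientConfig.MetaClientConfigBuilder<>()
        .setConnectionAddress("localhost:2181")                 // placeholder ZK address
        .setStoreType(MetaClientConfig.StoreType.ZOOKEEPER)
        .build();
    LeaderElectionClient participantClient =
        new LeaderElectionClient(metaClientConfig, "participant-1");  // participant name is a placeholder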