diff --git a/helix-common/pom.xml b/helix-common/pom.xml index 40f64043b6..82e35c86dd 100644 --- a/helix-common/pom.xml +++ b/helix-common/pom.xml @@ -89,6 +89,54 @@ + + org.apache.maven.plugins + maven-compiler-plugin + 3.12.1 + + + JDK 8 + compile + + compile + + + ${project.build.outputDirectory}_jdk8 + 8 + true + + + + JDK 11 + compile + + compile + + + 11 + true + + + + + + org.apache.maven.plugins + maven-jar-plugin + 3.3.0 + + + default-package-jdk11 + package + + jar + + + ${project.build.outputDirectory}_jdk8 + jdk8 + + + + org.apache.maven.plugins maven-assembly-plugin diff --git a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java index e2cc2de2d5..22f6c7c76f 100644 --- a/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java +++ b/helix-common/src/main/java/org/apache/helix/constants/InstanceConstants.java @@ -1,17 +1,103 @@ package org.apache.helix.constants; +import java.util.Set; + +import com.google.common.collect.ImmutableSet; + public class InstanceConstants { public static final String INSTANCE_NOT_DISABLED = "INSTANCE_NOT_DISABLED"; + /** + * The set contains the InstanceOperations that are allowed to be assigned replicas by the rebalancer. + */ + public static final Set ASSIGNABLE_INSTANCE_OPERATIONS = + ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.DISABLE); + + + /** + * The set contains the InstanceOperations that are overridden when the deprecated HELIX_ENABLED + * field is set to false. This will maintain backwards compatibility with the deprecated field. + * TODO: Remove this when the deprecated HELIX_ENABLED is removed. + */ + public static final Set INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS = + ImmutableSet.of(InstanceOperation.ENABLE, InstanceOperation.EVACUATE); + + + /** + * The set of InstanceOperations that are not allowed to be populated in the RoutingTableProvider. + */ + public static final Set UNROUTABLE_INSTANCE_OPERATIONS = + ImmutableSet.of(InstanceOperation.SWAP_IN, InstanceOperation.UNKNOWN); + + @Deprecated public enum InstanceDisabledType { CLOUD_EVENT, USER_OPERATION, DEFAULT_INSTANCE_DISABLE_TYPE } + public enum InstanceOperationSource { + ADMIN(0), USER(1), AUTOMATION(2), DEFAULT(3); + + private final int _priority; + + InstanceOperationSource(int priority) { + _priority = priority; + } + + public int getPriority() { + return _priority; + } + + /** + * Convert from InstanceDisabledType to InstanceOperationTrigger + * + * @param disabledType InstanceDisabledType + * @return InstanceOperationTrigger + */ + public static InstanceOperationSource instanceDisabledTypeToInstanceOperationSource( + InstanceDisabledType disabledType) { + switch (disabledType) { + case CLOUD_EVENT: + return InstanceOperationSource.AUTOMATION; + case USER_OPERATION: + return InstanceOperationSource.USER; + default: + return InstanceOperationSource.DEFAULT; + } + } + } + public enum InstanceOperation { - EVACUATE, // Node will be removed after a period of time - SWAP_IN, // New node joining for swap operation - SWAP_OUT // Existing Node to be removed for swap operation + /** + * Behavior: Replicas will be assigned to the node and will receive upward state transitions if + * for new assignments and downward state transitions if replicas are being moved elsewhere. + * Final State: The node will have replicas assigned to it and will be considered for future assignment. 
+ */ + ENABLE, + /** + * Behavior: All replicas on the node will be set to OFFLINE. + * Final State: The node will have all replicas in the OFFLINE state and can't take new assignment. + */ + DISABLE, + /** + * Behavior: All replicas will be moved off the node, after a replacement has been bootstrapped + * in another node in the cluster. + * Final State: The node will not contain any replicas and will not be considered for *NEW* assignment. + */ + EVACUATE, + /** + * Behavior: Node will have all replicas on its corresponding (same logicalId) swap-out node bootstrapped + * (ERROR and OFFLINE replicas on swap-out node will not be bootstrapped) to the same states if the StateModelDef allows. + * This node will be excluded from the RoutingTableProvider. + * Final State: This node will be a mirror of the swap-out node, will not be considered for assignment, and will not be populated + * in the RoutingTableProvider. + */ + SWAP_IN, + /** + * Behavior: Node will have all of its replicas dropped immediately and will be removed from the RoutingTableProvider. + * Final State: Node will not hold replicas, be considered for assignment, or be populated in the RoutingTableProvider. + */ + UNKNOWN } } diff --git a/helix-core/pom.xml b/helix-core/pom.xml index ba3d7b7e51..c8c5a504d2 100644 --- a/helix-core/pom.xml +++ b/helix-core/pom.xml @@ -219,7 +219,6 @@ package jar - test-jar ${project.build.outputDirectory}_jdk8 diff --git a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java index 021332be7a..07afb55b6f 100644 --- a/helix-core/src/main/java/org/apache/helix/HelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/HelixAdmin.java @@ -281,9 +281,11 @@ void addResource(String clusterName, String resourceName, int numPartitions, Str * @param instanceName * @param enabled */ + @Deprecated void enableInstance(String clusterName, String instanceName, boolean enabled); /** + * @deprecated use {@link #setInstanceOperation(String, String, InstanceConstants.InstanceOperation)} * @param clusterName * @param instanceName * @param enabled @@ -292,27 +294,54 @@ void addResource(String clusterName, String resourceName, int numPartitions, Str * @param reason set additional string description on why the instance is disabled when * enabled is false. Existing disabled reason will be over write if instance is in disabled state. */ + @Deprecated void enableInstance(String clusterName, String instanceName, boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason); /** * Batch enable/disable instances in a cluster * By default, all the instances are enabled + * @deprecated use {@link #setInstanceOperation(String, String, InstanceConstants.InstanceOperation)} * @param clusterName * @param instances * @param enabled */ + @Deprecated void enableInstance(String clusterName, List<String> instances, boolean enabled); /** - * Set the instanceOperation field. + * Set the instanceOperation of an instance with {@link InstanceConstants.InstanceOperation}. * * @param clusterName The cluster name * @param instanceName The instance name - * @param instanceOperation The instance operation + * @param instanceOperation The instance operation type */ void setInstanceOperation(String clusterName, String instanceName, - @Nullable InstanceConstants.InstanceOperation instanceOperation); + InstanceConstants.InstanceOperation instanceOperation); + + /** + * Set the instanceOperation of an instance with {@link InstanceConstants.InstanceOperation}.
+ * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + */ + void setInstanceOperation(String clusterName, String instanceName, + InstanceConstants.InstanceOperation instanceOperation, String reason); + + /** + * Set the instanceOperation of an instance with {@link InstanceConstants.InstanceOperation}. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + * @param overrideAll Whether to override all existing instance operations from all other + * instance operations + */ + void setInstanceOperation(String clusterName, String instanceName, + InstanceConstants.InstanceOperation instanceOperation, String reason, boolean overrideAll); /** * Disable or enable a resource @@ -415,6 +444,18 @@ void manuallyEnableMaintenanceMode(String clusterName, boolean enabled, String r */ ClusterManagementMode getClusterManagementMode(String clusterName); + /** + * Set a list of partitions for an instance to ERROR state from any state. + * The partitions could be in any state and setPartitionsToError will bring them to ERROR + * state. ANY to ERROR state transition is required for this. + * @param clusterName + * @param instanceName + * @param resourceName + * @param partitionNames + */ + void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List<String> partitionNames); + /** * Reset a list of partitions in error state for an instance * The partitions are assume to be in error state and reset will bring them from error diff --git a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java index 04ad4b798a..20c5001164 100644 --- a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java +++ b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/DefaultCloudEventCallbackImpl.java @@ -23,6 +23,8 @@ import org.apache.helix.HelixManager; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; +import org.apache.helix.model.InstanceConfig; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.util.InstanceValidationUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,9 +51,14 @@ public void disableInstance(HelixManager manager, Object eventInfo) { LOG.info("DefaultCloudEventCallbackImpl disable Instance {}", manager.getInstanceName()); if (InstanceValidationUtil .isEnabled(manager.getHelixDataAccessor(), manager.getInstanceName())) { - manager.getClusterManagmentTool() - .enableInstance(manager.getClusterName(), manager.getInstanceName(), false, - InstanceConstants.InstanceDisabledType.CLOUD_EVENT, message); + InstanceUtil.setInstanceOperation(manager.getConfigAccessor(), + manager.getHelixDataAccessor().getBaseDataAccessor(), manager.getClusterName(), + manager.getInstanceName(), + new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION) + .setReason(message) + .build()); } HelixEventHandlingUtil.updateCloudEventOperationInClusterConfig(manager.getClusterName(), manager.getInstanceName(), manager.getHelixDataAccessor().getBaseDataAccessor(), false, @@
-72,10 +79,13 @@ public void enableInstance(HelixManager manager, Object eventInfo) { HelixEventHandlingUtil .updateCloudEventOperationInClusterConfig(manager.getClusterName(), instanceName, manager.getHelixDataAccessor().getBaseDataAccessor(), true, message); - if (HelixEventHandlingUtil.isInstanceDisabledForCloudEvent(instanceName, accessor)) { - manager.getClusterManagmentTool().enableInstance(manager.getClusterName(), instanceName, true, - InstanceConstants.InstanceDisabledType.CLOUD_EVENT, message); - } + InstanceUtil.setInstanceOperation(manager.getConfigAccessor(), + manager.getHelixDataAccessor().getBaseDataAccessor(), manager.getClusterName(), + manager.getInstanceName(), + new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).setReason(message) + .build()); } /** diff --git a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java index ee96a13ee7..ceff1d299c 100644 --- a/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java +++ b/helix-core/src/main/java/org/apache/helix/cloud/event/helix/HelixEventHandlingUtil.java @@ -48,7 +48,10 @@ class HelixEventHandlingUtil { * @param dataAccessor * @return return true only when instance is Helix disabled and the disabled reason in * instanceConfig is cloudEvent + * @deprecated No need to check this if using InstanceOperation and specifying the trigger as CLOUD + * when enabling. */ + @Deprecated static boolean isInstanceDisabledForCloudEvent(String instanceName, HelixDataAccessor dataAccessor) { InstanceConfig instanceConfig = diff --git a/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java b/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java index 45b0dde766..cd2b16f922 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/changedetector/trimmer/InstanceConfigTrimmer.java @@ -19,6 +19,7 @@ * under the License. */ +import java.util.HashMap; import java.util.Map; import java.util.Set; @@ -62,6 +63,21 @@ protected Map> getNonTrimmableFields(InstanceConfig insta return STATIC_TOPOLOGY_RELATED_FIELD_MAP; } + /** + * We should trim HELIX_INSTANCE_OPERATIONS field, it is used to filter instances in the + * BaseControllerDataProvider. That filtering will be used to determine if ResourceChangeSnapshot + * has changed as opposed to checking the actual value of the field. + * + * @param property the instance config + * @return a map contains all non-trimmable field keys that need to be kept. 
+ */ + protected Map> getNonTrimmableKeys(InstanceConfig property) { + Map> nonTrimmableKeys = super.getNonTrimmableKeys(property); + nonTrimmableKeys.get(FieldType.LIST_FIELD) + .remove(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name()); + return nonTrimmableKeys; + } + @Override public InstanceConfig trimProperty(InstanceConfig property) { return new InstanceConfig(doTrim(property)); diff --git a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java index 1e40bbb720..ce5d3de8c7 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java +++ b/helix-core/src/main/java/org/apache/helix/controller/dataproviders/BaseControllerDataProvider.java @@ -64,7 +64,6 @@ import org.apache.helix.model.StateModelDefinition; import org.apache.helix.task.TaskConstants; import org.apache.helix.util.HelixUtil; -import org.apache.helix.util.InstanceValidationUtil; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.zookeeper.zkclient.DataUpdater; import org.slf4j.Logger; @@ -120,30 +119,40 @@ public class BaseControllerDataProvider implements ControlContextProvider { private final Set _disabledInstanceSet = new HashSet<>(); private static final class DerivedInstanceCache { - // Assignable instances are instances will contain at most one instance with a given logicalId. - // This is used for SWAP related operations where there can be two instances with the same logicalId. + private final Map> + _instanceConfigMapByInstanceOperation; private final Map _assignableInstanceConfigMap; private final Map _assignableLiveInstancesMap; private final Map _swapOutInstanceNameToSwapInInstanceName; + private final Map _swapInInstanceNameToSwapOutInstanceName; private final Set _liveSwapInInstanceNames; - private final Set _enabledSwapInInstanceNames; - DerivedInstanceCache(Map assignableInstanceConfigMap, + DerivedInstanceCache( + Map> instanceConfigMapByInstanceOperation, + Map assignableInstanceConfigMap, Map assignableLiveInstancesMap, Map swapOutInstanceNameToSwapInInstanceName, - Set liveSwapInInstanceNames, Set enabledSwapInInstanceNames) { + Set liveSwapInInstanceNames) { + _instanceConfigMapByInstanceOperation = instanceConfigMapByInstanceOperation; _assignableInstanceConfigMap = assignableInstanceConfigMap; _assignableLiveInstancesMap = assignableLiveInstancesMap; _swapOutInstanceNameToSwapInInstanceName = swapOutInstanceNameToSwapInInstanceName; + _swapInInstanceNameToSwapOutInstanceName = swapOutInstanceNameToSwapInInstanceName.entrySet() + .stream().collect(Collectors.toMap(Map.Entry::getValue, Map.Entry::getKey)); _liveSwapInInstanceNames = liveSwapInInstanceNames; - _enabledSwapInInstanceNames = enabledSwapInInstanceNames; + } + + private Map getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation instanceOperation) { + return _instanceConfigMapByInstanceOperation.getOrDefault(instanceOperation, + Collections.emptyMap()); } } // All maps and sets are encapsulated in DerivedInstanceCache to ensure that they are updated together // as a snapshot. 
private DerivedInstanceCache _derivedInstanceCache = - new DerivedInstanceCache(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashSet<>(), + new DerivedInstanceCache(new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashMap<>(), new HashSet<>()); private final Map _abnormalStateResolverMap = new HashMap<>(); private final Set _timedOutInstanceDuringMaintenance = new HashSet<>(); @@ -383,15 +392,14 @@ private void updateInstanceSets(Map instanceConfigMap, ClusterTopologyConfig.createFromClusterConfig(clusterConfig); // Create new caches to be populated. + Map> + newInstanceConfigMapByInstanceOperation = new HashMap<>(); Map newAssignableInstanceConfigMap = new HashMap<>(); Map newAssignableLiveInstancesMap = new HashMap<>(); - Map newSwapOutInstanceNameToSwapInInstanceName = new HashMap<>(); + Map newSwapOutInstanceNameToSwapOutInstanceName = new HashMap<>(); Set newLiveSwapInInstanceNames = new HashSet<>(); - Set newEnabledSwapInInstanceNames = new HashSet<>(); - - Map filteredInstancesByLogicalId = new HashMap<>(); - Map swapOutLogicalIdsByInstanceName = new HashMap<>(); - Map swapInInstancesByLogicalId = new HashMap<>(); + Map swapInLogicalIdsByInstanceName = new HashMap<>(); + Map nonSwapInInstancesByLogicalId = new HashMap<>(); for (Map.Entry entry : instanceConfigMap.entrySet()) { String node = entry.getKey(); @@ -404,44 +412,21 @@ private void updateInstanceSets(Map instanceConfigMap, String currentInstanceLogicalId = currentInstanceConfig.getLogicalId(clusterTopologyConfig.getEndNodeType()); - // Filter out instances with duplicate logical IDs. If there are duplicates, the instance with - // InstanceOperation SWAP_OUT will be chosen over the instance with SWAP_IN. SWAP_IN is not - // assignable. If there are duplicates with one node having no InstanceOperation and the other - // having SWAP_OUT, the node with no InstanceOperation will be chosen. This signifies SWAP - // completion, therefore making the node assignable. - if (filteredInstancesByLogicalId.containsKey(currentInstanceLogicalId)) { - String filteredNode = filteredInstancesByLogicalId.get(currentInstanceLogicalId); - InstanceConfig filteredDuplicateInstanceConfig = instanceConfigMap.get(filteredNode); - - if ((filteredDuplicateInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) - || currentInstanceConfig.getInstanceOperation().isEmpty()) { - // If the already filtered instance is SWAP_IN and this instance is in SWAP_OUT, then replace the filtered - // instance with this instance. If this instance has no InstanceOperation, then replace the filtered instance - // with this instance. This is the case where the SWAP_IN node has been marked as complete or SWAP_IN exists and - // SWAP_OUT does not. There can never be a case where both have no InstanceOperation set. - newAssignableInstanceConfigMap.remove(filteredNode); - newAssignableInstanceConfigMap.put(node, currentInstanceConfig); - filteredInstancesByLogicalId.put(currentInstanceLogicalId, node); - } - } else if (!currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) { - // EVACUATE instances are not considered to be assignable. 
+ newInstanceConfigMapByInstanceOperation.computeIfAbsent( + currentInstanceConfig.getInstanceOperation().getOperation(), + k -> new HashMap<>()) + .put(node, currentInstanceConfig); + + if (currentInstanceConfig.isAssignable()) { newAssignableInstanceConfigMap.put(node, currentInstanceConfig); - filteredInstancesByLogicalId.put(currentInstanceLogicalId, node); } - if (currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - swapOutLogicalIdsByInstanceName.put(currentInstanceConfig.getInstanceName(), + if (currentInstanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { + swapInLogicalIdsByInstanceName.put(currentInstanceConfig.getInstanceName(), currentInstanceLogicalId); - } - - if (currentInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - swapInInstancesByLogicalId.put( + } else { + nonSwapInInstancesByLogicalId.put( currentInstanceConfig.getLogicalId(clusterTopologyConfig.getEndNodeType()), currentInstanceConfig.getInstanceName()); } @@ -453,25 +438,20 @@ private void updateInstanceSets(Map instanceConfigMap, } }); - swapOutLogicalIdsByInstanceName.forEach((swapOutInstanceName, value) -> { - String swapInInstanceName = swapInInstancesByLogicalId.get(value); - if (swapInInstanceName != null) { - newSwapOutInstanceNameToSwapInInstanceName.put(swapOutInstanceName, swapInInstanceName); + swapInLogicalIdsByInstanceName.forEach((swapInInstanceName, swapInLogicalId) -> { + String swapOutInstanceName = nonSwapInInstancesByLogicalId.get(swapInLogicalId); + if (swapOutInstanceName != null) { + newSwapOutInstanceNameToSwapOutInstanceName.put(swapOutInstanceName, swapInInstanceName); if (liveInstancesMap.containsKey(swapInInstanceName)) { newLiveSwapInInstanceNames.add(swapInInstanceName); } - if (InstanceValidationUtil.isInstanceEnabled(instanceConfigMap.get(swapInInstanceName), - clusterConfig)) { - newEnabledSwapInInstanceNames.add(swapInInstanceName); - } } }); // Replace caches with up-to-date instance sets. - _derivedInstanceCache = - new DerivedInstanceCache(newAssignableInstanceConfigMap, newAssignableLiveInstancesMap, - newSwapOutInstanceNameToSwapInInstanceName, newLiveSwapInInstanceNames, - newEnabledSwapInInstanceNames); + _derivedInstanceCache = new DerivedInstanceCache(newInstanceConfigMapByInstanceOperation, + newAssignableInstanceConfigMap, newAssignableLiveInstancesMap, + newSwapOutInstanceNameToSwapOutInstanceName, newLiveSwapInInstanceNames); } private void refreshResourceConfig(final HelixDataAccessor accessor, @@ -722,78 +702,50 @@ public Set getAllInstances() { } /** - * Return all the live nodes that are enabled and assignable + * Return a set of all instances that have the UNKNOWN InstanceOperation. + * These instances are not assignable and should have all replicas dropped + * immediately. 
* - * @return A new set contains live instance name and that are marked enabled + * @return A new set contains */ - public Set getAssignableEnabledLiveInstances() { - Set enabledLiveInstances = new HashSet<>(getAssignableLiveInstances().keySet()); - enabledLiveInstances.removeAll(getDisabledInstances()); - - return enabledLiveInstances; + public Set getUnknownInstances() { + return Collections.unmodifiableSet( + _derivedInstanceCache.getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation.UNKNOWN).keySet()); } /** - * Return all the live nodes that are enabled + * Return all the live nodes that are enabled. If a node is enabled, it is assignable. * @return A new set contains live instance name and that are marked enabled */ public Set getEnabledLiveInstances() { Set enabledLiveInstances = new HashSet<>(getLiveInstances().keySet()); - enabledLiveInstances.removeAll(getDisabledInstances()); + enabledLiveInstances.retainAll(getEnabledInstances()); return enabledLiveInstances; } /** - * Return all nodes that are enabled and assignable. - * - * @return A new set contains instance name and that are marked enabled - */ - public Set getAssignableEnabledInstances() { - Set enabledNodes = new HashSet<>(getAssignableInstances()); - enabledNodes.removeAll(getDisabledInstances()); - - return enabledNodes; - } - - /** - * Return all nodes that are enabled. + * Return all nodes that are enabled. If a node is enabled, it is assignable. * @return A new set contains instance name and that are marked enabled */ public Set getEnabledInstances() { - Set enabledNodes = new HashSet<>(getAllInstances()); - enabledNodes.removeAll(getDisabledInstances()); - - return enabledNodes; + return new HashSet<>(_derivedInstanceCache.getInstanceConfigMapByInstanceOperation( + InstanceConstants.InstanceOperation.ENABLE).keySet()); } /** - * Return all the live nodes that are enabled and assignable and tagged with given instanceTag. + * Return all the live nodes that are enabled and tagged with given instanceTag. If a node is + * enabled, it is assignable. * * @param instanceTag The instance group tag. * @return A new set contains live instance name and that are marked enabled and have the * specified tag. */ - public Set getAssignableEnabledLiveInstancesWithTag(String instanceTag) { - Set enabledLiveInstancesWithTag = new HashSet<>(getAssignableLiveInstances().keySet()); - Set instancesWithTag = getAssignableInstancesWithTag(instanceTag); - enabledLiveInstancesWithTag.retainAll(instancesWithTag); - enabledLiveInstancesWithTag.removeAll(getDisabledInstances()); - - return enabledLiveInstancesWithTag; - } - - /** - * Return all the live nodes that are enabled and tagged with given instanceTag. - * @param instanceTag The instance group tag. - * @return A new set contains live instance name and that are marked enabled and have the - * specified tag. - */ public Set getEnabledLiveInstancesWithTag(String instanceTag) { - Set enabledLiveInstancesWithTag = new HashSet<>(getLiveInstances().keySet()); + Set enabledLiveInstancesWithTag = new HashSet<>(getEnabledLiveInstances()); Set instancesWithTag = getAssignableInstancesWithTag(instanceTag); enabledLiveInstancesWithTag.retainAll(instancesWithTag); - enabledLiveInstancesWithTag.removeAll(getDisabledInstances()); return enabledLiveInstancesWithTag; } @@ -858,9 +810,9 @@ public Set getDisabledInstances() { } /** - * Get all swapping instance pairs. + * Get all swapping instance pairs keyed by swap-out instanceNames. 
* - * @return a map of SWAP_OUT instanceNames and their corresponding SWAP_IN instanceNames. + * @return a map of swap out instanceNames and their corresponding SWAP_IN instanceNames. */ public Map getSwapOutToSwapInInstancePairs() { return Collections.unmodifiableMap( @@ -868,21 +820,22 @@ public Map getSwapOutToSwapInInstancePairs() { } /** - * Get all the live SWAP_IN instances. + * Get all swapping instance pairs keyed by swap-in instanceNames. * - * @return a set of SWAP_IN instanceNames that have a corresponding SWAP_OUT instance. + * @return a map of swap in instanceNames and their corresponding swap out instanceNames. */ - public Set getLiveSwapInInstanceNames() { - return Collections.unmodifiableSet(_derivedInstanceCache._liveSwapInInstanceNames); + public Map getSwapInToSwapOutInstancePairs() { + return Collections.unmodifiableMap( + _derivedInstanceCache._swapInInstanceNameToSwapOutInstanceName); } /** - * Get all the enabled SWAP_IN instances. + * Get all the live SWAP_IN instances. * - * @return a set of SWAP_IN instanceNames that have a corresponding SWAP_OUT instance. + * @return a set of SWAP_IN instanceNames that have a corresponding swap out instance. */ - public Set getEnabledSwapInInstanceNames() { - return Collections.unmodifiableSet(_derivedInstanceCache._enabledSwapInInstanceNames); + public Set getLiveSwapInInstanceNames() { + return Collections.unmodifiableSet(_derivedInstanceCache._liveSwapInInstanceNames); } public synchronized void setLiveInstances(List liveInstances) { @@ -1127,7 +1080,8 @@ private void updateDisabledInstances(Collection allInstanceConfi _disabledInstanceSet.clear(); for (InstanceConfig config : allInstanceConfigs) { Map> disabledPartitionMap = config.getDisabledPartitionsMap(); - if (!InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (config.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.DISABLE)) { _disabledInstanceSet.add(config.getInstanceName()); } for (String resource : disabledPartitionMap.keySet()) { diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java index 160e4eda6f..477ef2032c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/AbstractRebalancer.java @@ -362,12 +362,19 @@ protected Map computeBestPossibleMap(List preferenceList } } + // TODO: Consider moving this logic to assignStatesToInstances since we are already passing + // disabledInstancesForPartition to that method. // (2) Set initial-state to certain instances that are disabled and in preference list. // Be careful with the conditions. for (String instance : preferenceList) { if (disabledInstancesForPartition.contains(instance)) { if (currentStateMap.containsKey(instance)) { - if (!currentStateMap.get(instance).equals(HelixDefinedState.ERROR.name())) { + if (currentStateMap.get(instance).equals(HelixDefinedState.ERROR.name())) { + // Must set to ERROR state here because assignStatesToInstances will not assign + // any state for disabledInstancesForPartition. This prevents the ERROR partition + // from being DROPPED. 
+ bestPossibleStateMap.put(instance, HelixDefinedState.ERROR.name()); + } else { bestPossibleStateMap.put(instance, stateModelDef.getInitialState()); } } else { diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java index 252b63255a..d55a6eae83 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/DelayedAutoRebalancer.java @@ -95,7 +95,7 @@ public IdealState computeNewIdealState(String resourceName, String instanceTag = currentIdealState.getInstanceGroupTag(); if (instanceTag != null) { - assignableLiveEnabledNodes = clusterData.getAssignableEnabledLiveInstancesWithTag(instanceTag); + assignableLiveEnabledNodes = clusterData.getEnabledLiveInstancesWithTag(instanceTag); assignableNodes = clusterData.getAssignableInstancesWithTag(instanceTag); if (LOG.isInfoEnabled()) { @@ -105,7 +105,7 @@ public IdealState computeNewIdealState(String resourceName, currentIdealState.getInstanceGroupTag(), resourceName, assignableNodes, assignableLiveEnabledNodes)); } } else { - assignableLiveEnabledNodes = clusterData.getAssignableEnabledLiveInstances(); + assignableLiveEnabledNodes = clusterData.getEnabledLiveInstances(); assignableNodes = clusterData.getAssignableInstances(); } @@ -246,7 +246,7 @@ public ResourceAssignment computeBestPossiblePartitionState(ResourceControllerDa LOG.debug("Processing resource:" + resource.getResourceName()); } - Set allNodes = cache.getAssignableEnabledInstances(); + Set allNodes = cache.getEnabledInstances(); Set liveNodes = cache.getAssignableLiveInstances().keySet(); ClusterConfig clusterConfig = cache.getClusterConfig(); @@ -268,7 +268,8 @@ public ResourceAssignment computeBestPossiblePartitionState(ResourceControllerDa Map bestStateForPartition = // We use cache.getLiveInstances().keySet() to make sure we gracefully handle n -> n + 1 replicas if possible // when the one of the current nodes holding the replica is no longer considered assignable. (ex: EVACUATE) - computeBestPossibleStateForPartition(cache.getLiveInstances().keySet(), stateModelDef, preferenceList, + computeBestPossibleStateForPartition(cache.getLiveInstances().keySet(), + stateModelDef, preferenceList, currentStateOutput, disabledInstancesForPartition, idealState, clusterConfig, partition, cache.getAbnormalStateResolver(stateModelDefName), cache); @@ -328,6 +329,7 @@ protected Map computeBestPossibleStateForPartition(Set l while (it.hasNext()) { String instance = it.next(); String state = currentStateMap.get(instance); + // TODO: This may never be a possible case, figure out if we can safely remove this. if (state == null) { it.remove(); instancesToDrop.add(instance); // These instances should be set to DROPPED after we get bestPossibleStateMap; @@ -405,6 +407,8 @@ protected Map computeBestPossibleStateForPartition(Set l } } + // TODO: This may not be necessary, all of the instances bestPossibleStateMap should be set to ERROR + // if necessary in the call to computeBestPossibleMap. 
// Adding ERROR replica mapping to best possible // ERROR assignment should be mutual excluded from DROPPED assignment because // once there is an ERROR replica in the mapping, bestPossibleStateMap.size() > numReplicas prevents diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java index 335c30fdf2..2618275b13 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/topology/Topology.java @@ -205,7 +205,7 @@ private Node createClusterTree(ClusterConfig clusterConfig, boolean faultZoneLev } addEndNode(root, instanceName, instanceTopologyMap, weight, _liveInstances); } catch (IllegalArgumentException e) { - if (InstanceValidationUtil.isInstanceEnabled(insConfig, clusterConfig)) { + if (insConfig.getInstanceEnabled()) { throw e; } else { logger.warn("Topology setting {} for instance {} is unset or invalid, ignore the instance!", diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java index 2796064db0..90da408b1a 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/util/DelayedRebalanceUtil.java @@ -29,19 +29,16 @@ import java.util.stream.Collectors; import org.apache.helix.HelixManager; -import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.rebalancer.waged.model.AssignableReplica; import org.apache.helix.controller.rebalancer.waged.model.ClusterModelProvider; import org.apache.helix.model.ClusterConfig; -import org.apache.helix.model.ClusterTopologyConfig; import org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.Partition; import org.apache.helix.model.ResourceAssignment; import org.apache.helix.model.ResourceConfig; import org.apache.helix.util.InstanceValidationUtil; -import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -157,7 +154,7 @@ private static long getInactiveTime(String instance, Set liveInstances, } // check the time instance got disabled. 
- if (!InstanceValidationUtil.isInstanceEnabled(instanceConfig, clusterConfig)) { + if (!instanceConfig.getInstanceEnabled()) { long disabledTime = instanceConfig.getInstanceEnabledTime(); String batchedDisabledTime = clusterConfig.getInstanceHelixDisabledTimeStamp(instance); if (batchedDisabledTime != null && !batchedDisabledTime.isEmpty()) { @@ -409,7 +406,7 @@ private static List findPartitionsMissingMinActiveReplica( ResourceAssignment resourceAssignment) { String resourceName = resourceAssignment.getResourceName(); IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig .mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName), @@ -430,7 +427,7 @@ private static List findPartitionsMissingMinActiveReplica( private static int getMinActiveReplica(ResourceControllerDataProvider clusterData, String resourceName) { IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); return DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig .mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName), diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java index ae7e49a1d5..39a197bff5 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java @@ -304,7 +304,7 @@ private Map computeBestPossibleStates( Set activeNodes = DelayedRebalanceUtil.getActiveNodes(clusterData.getAssignableInstances(), - clusterData.getAssignableEnabledLiveInstances(), + clusterData.getEnabledLiveInstances(), clusterData.getInstanceOfflineTimeMap(), clusterData.getAssignableLiveInstances().keySet(), clusterData.getAssignableInstanceConfigMap(), clusterData.getClusterConfig()); @@ -401,7 +401,7 @@ private Map handleDelayedRebalanceMinActiveReplica( RebalanceAlgorithm algorithm) throws HelixRebalanceException { // the "real" live nodes at the time - final Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + final Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); if (activeNodes.equals(enabledLiveInstances) || !requireRebalanceOverwrite(clusterData, currentResourceAssignment)) { // no need for additional process, return the current resource assignment @@ -602,7 +602,7 @@ private void delayedRebalanceSchedule(ResourceControllerDataProvider clusterData ClusterConfig clusterConfig = clusterData.getClusterConfig(); boolean delayedRebalanceEnabled = DelayedRebalanceUtil.isDelayRebalanceEnabled(clusterConfig); Set offlineOrDisabledInstances = new HashSet<>(delayedActiveNodes); - offlineOrDisabledInstances.removeAll(clusterData.getAssignableEnabledLiveInstances()); + offlineOrDisabledInstances.removeAll(clusterData.getEnabledLiveInstances()); for (String resource : resourceSet) { DelayedRebalanceUtil 
.setRebalanceScheduler(resource, delayedRebalanceEnabled, offlineOrDisabledInstances, @@ -623,7 +623,7 @@ protected boolean requireRebalanceOverwrite(ResourceControllerDataProvider clust String resourceName = resourceAssignment.getResourceName(); IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getAssignableEnabledLiveInstances(); + Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java index ddd9880c0b..75151d3363 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/model/ClusterModelProvider.java @@ -164,7 +164,7 @@ public static ClusterModel generateClusterModelFromExistingAssignment( ResourceControllerDataProvider dataProvider, Map resourceMap, Map currentStateAssignment) { return generateClusterModel(dataProvider, resourceMap, - dataProvider.getAssignableEnabledLiveInstances(), Collections.emptyMap(), + dataProvider.getEnabledLiveInstances(), Collections.emptyMap(), Collections.emptyMap(), currentStateAssignment, RebalanceScopeType.GLOBAL_BASELINE); } diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java b/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java index 8771bff64b..0db5252ee0 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/AttributeName.java @@ -24,6 +24,7 @@ public enum AttributeName { RESOURCES_TO_REBALANCE, BEST_POSSIBLE_STATE, CURRENT_STATE, + CURRENT_STATE_EXCLUDING_UNKNOWN, CUSTOMIZED_STATE, INTERMEDIATE_STATE, MESSAGES_ALL, diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java index 2a6f9644e9..714e9325d1 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/BestPossibleStateCalcStage.java @@ -30,9 +30,11 @@ import java.util.concurrent.Callable; import java.util.stream.Collectors; +import org.apache.helix.HelixDefinedState; import org.apache.helix.HelixException; import org.apache.helix.HelixManager; import org.apache.helix.HelixRebalanceException; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.LogUtil; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.pipeline.AbstractBaseStage; @@ -74,7 +76,8 @@ public class BestPossibleStateCalcStage extends AbstractBaseStage { @Override public void process(ClusterEvent event) throws Exception { _eventId = event.getEventId(); - CurrentStateOutput currentStateOutput = event.getAttribute(AttributeName.CURRENT_STATE.name()); + CurrentStateOutput currentStateOutput = + event.getAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name()); final Map resourceMap = 
event.getAttribute(AttributeName.RESOURCES_TO_REBALANCE.name()); final ClusterStatusMonitor clusterStatusMonitor = @@ -83,8 +86,8 @@ public void process(ClusterEvent event) throws Exception { event.getAttribute(AttributeName.ControllerDataProvider.name()); if (currentStateOutput == null || resourceMap == null || cache == null) { - throw new StageException( - "Missing attributes in event:" + event + ". Requires CURRENT_STATE|RESOURCES|DataCache"); + throw new StageException("Missing attributes in event:" + event + + ". Requires CURRENT_STATE_EXCLUDING_UNKNOWN|RESOURCES|DataCache"); } final BestPossibleStateOutput bestPossibleStateOutput = @@ -131,82 +134,104 @@ public void process(ClusterEvent event) throws Exception { }); } + private String selectSwapInState(StateModelDefinition stateModelDef, Map stateMap, + String swapOutInstance) { + // If the swap-in node is live, select state with the following logic: + // 1. If the swap-out instance's replica is in the stateMap: + // - if the swap-out instance's replica is a topState, select the swap-in instance's replica to the topState. + // if another is allowed to be added, otherwise select the swap-in instance's replica to a secondTopState. + // - if the swap-out instance's replica is not a topState or ERROR, select the swap-in instance's replica to the same state. + // - if the swap-out instance's replica is ERROR, select the swap-in instance's replica to the initialState. + // 2. If the swap-out instance's replica is not in the stateMap, select the swap-in instance's replica to the initialState. + // This happens when the swap-out node is offline. + if (stateMap.containsKey(swapOutInstance)) { + if (stateMap.get(swapOutInstance).equals(stateModelDef.getTopState()) || stateMap.get( + swapOutInstance).equals(HelixDefinedState.ERROR.name())) { + // If the swap-out instance's replica is a topState, select the swap-in instance's replica + // to be the topState if the StateModel allows another to be added. If not, select the swap-in + // to be the secondTopState. + String topStateCount = stateModelDef.getNumInstancesPerState(stateModelDef.getTopState()); + if (topStateCount.equals(StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES) + || topStateCount.equals(StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS)) { + // If the StateModel allows for another replica with the topState to be added, + // select the swap-in instance's replica to the topState. + return stateModelDef.getTopState(); + } + // If StateModel does not allow another topState replica to be + // added, select the swap-in instance's replica to be the secondTopState. + return stateModelDef.getSecondTopStates().iterator().next(); + } + // If the swap-out instance's replica is not a topState or ERROR, select the swap-in instance's replica + // to be the same state + return stateMap.get(swapOutInstance); + } + // If the swap-out instance's replica is not in the stateMap, return null + return null; + } + private void addSwapInInstancesToBestPossibleState(Map resourceMap, BestPossibleStateOutput bestPossibleStateOutput, ResourceControllerDataProvider cache) { - // 1. Get all SWAP_OUT instances and corresponding SWAP_IN instance pairs in the cluster. + // 1. Get all swap out instances and corresponding SWAP_IN instance pairs in the cluster. Map swapOutToSwapInInstancePairs = cache.getSwapOutToSwapInInstancePairs(); - // 2. Get all enabled and live SWAP_IN instances in the cluster. + Map swapInToSwapOutInstancePairs = cache.getSwapInToSwapOutInstancePairs(); + + // 2. 
Get all live SWAP_IN instances in the cluster. Set liveSwapInInstances = cache.getLiveSwapInInstanceNames(); - Set enabledSwapInInstances = cache.getEnabledSwapInInstanceNames(); - // 3. For each SWAP_OUT instance in any of the preferenceLists, add the corresponding SWAP_IN instance to - // the stateMap with the correct state. - // Skipping this when there are no SWAP_IN instances that are alive will reduce computation time. - if (!liveSwapInInstances.isEmpty() && !cache.isMaintenanceModeEnabled()) { - resourceMap.forEach((resourceName, resource) -> { - StateModelDefinition stateModelDef = cache.getStateModelDef(resource.getStateModelDefRef()); - bestPossibleStateOutput.getResourceStatesMap().get(resourceName).getStateMap() - .forEach((partition, stateMap) -> { - // We use the preferenceList for the case where the swapOutInstance goes offline. - // We do not want to drop the replicas that may have been bootstrapped on the swapInInstance - // in the case that the swapOutInstance goes offline and no longer has an entry in the stateMap. - Set commonInstances = new HashSet<>( - bestPossibleStateOutput.getPreferenceList(resourceName, - partition.getPartitionName())); - commonInstances.retainAll(swapOutToSwapInInstancePairs.keySet()); - - commonInstances.forEach(swapOutInstance -> { - // If the corresponding swap-in instance is not live, skip assigning to it. - if (!liveSwapInInstances.contains( - swapOutToSwapInInstancePairs.get(swapOutInstance))) { - return; - } - - // If the corresponding swap-in instance is not enabled, assign replicas with - // initial state. - if (!enabledSwapInInstances.contains( - swapOutToSwapInInstancePairs.get(swapOutInstance))) { - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getInitialState()); - return; - } - - // If the swap-in node is live and enabled, do assignment with the following logic: - // 1. If the swap-out instance's replica is a secondTopState, set the swap-in instance's replica - // to the same secondTopState. - // 2. If the swap-out instance's replica is any other state and is in the preferenceList, - // set the swap-in instance's replica to the topState if the StateModel allows another to be added. - // If not, set the swap-in instance's replica to the secondTopState. - // We can make this assumption because if there is assignment to the swapOutInstance, it must be either - // a topState or a secondTopState. - if (stateMap.containsKey(swapOutInstance) && stateModelDef.getSecondTopStates() - .contains(stateMap.get(swapOutInstance))) { - // If the swap-out instance's replica is a secondTopState, set the swap-in instance's replica - // to the same secondTopState. - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateMap.get(swapOutInstance)); - } else { - // If the swap-out instance's replica is any other state in the stateMap or not present in the - // stateMap, set the swap-in instance's replica to the topState if the StateModel allows another - // to be added. If not, set the swap-in to the secondTopState. - String topStateCount = - stateModelDef.getNumInstancesPerState(stateModelDef.getTopState()); - if (topStateCount.equals( - StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES) - || topStateCount.equals( - StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS)) { - // If the StateModel allows for another replica with the topState to be added, - // set the swap-in instance's replica to the topState. 
- stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getTopState()); - } else { - // If StateModel does not allow another topState replica to be - // added, set the swap-in instance's replica to the secondTopState. - stateMap.put(swapOutToSwapInInstancePairs.get(swapOutInstance), - stateModelDef.getSecondTopStates().iterator().next()); - } - } - }); + if (liveSwapInInstances.isEmpty() || cache.isMaintenanceModeEnabled()) { + return; + } + + // 3. Find the assignment for each swap-in instance + // : : + Map>> swapInInstanceAssignment = new HashMap<>(); + resourceMap.forEach((resourceName, resource) -> { + bestPossibleStateOutput.getResourceStatesMap().get(resourceName).getStateMap() + .forEach((partition, stateMap) -> { + // We use the preferenceList for the case where the swapOutInstance goes offline. + // We do not want to drop the replicas that may have been bootstrapped on the swapInInstance + // in the case that the swapOutInstance goes offline and no longer has an entry in the stateMap. + Set commonInstances = + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition) != null + ? new HashSet<>( + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition).keySet()) + : Collections.emptySet(); + if (commonInstances.isEmpty()) { + return; + } + commonInstances.retainAll(swapOutToSwapInInstancePairs.keySet()); + + commonInstances.forEach(swapOutInstance -> { + swapInInstanceAssignment.computeIfAbsent( + swapOutToSwapInInstancePairs.get(swapOutInstance), k -> new HashMap<>()) + .computeIfAbsent(resourceName, k -> new HashSet<>()) + .add(partition.getPartitionName()); }); + }); + }); + + // 4. Add the correct states for the swap-in instances to the bestPossibleStateOutput. + if (!swapInInstanceAssignment.isEmpty()) { + swapInInstanceAssignment.forEach((swapInInstance, resourceMapForInstance) -> { + // If the corresponding swap-in instance is not live, skip assigning to it. + if (!liveSwapInInstances.contains(swapInInstance)) { + return; + } + + resourceMapForInstance.forEach((resourceName, partitions) -> { + partitions.forEach(partitionName -> { + Partition partition = new Partition(partitionName); + Map stateMap = + bestPossibleStateOutput.getInstanceStateMap(resourceName, partition); + + String selectedState = selectSwapInState( + cache.getStateModelDef(resourceMap.get(resourceName).getStateModelDefRef()), + stateMap, swapInToSwapOutInstancePairs.get(swapInInstance)); + if (stateMap != null) { + bestPossibleStateOutput.setState(resourceName, partition, swapInInstance, + selectedState); + } + }); + }); }); } } @@ -250,7 +275,7 @@ private BestPossibleStateOutput compute(ClusterEvent event, Map failureResources = new ArrayList<>(); @@ -323,25 +348,33 @@ public Object call() { }); } - // Check whether the offline/disabled instance count in the cluster reaches the set limit, + // Check whether the offline/unable to accept online replicas instance count in the cluster reaches the set limit, // if yes, auto enable maintenance mode, and use the maintenance rebalancer for this pipeline. 
- private boolean validateOfflineInstancesLimit(final ResourceControllerDataProvider cache, + private boolean validateInstancesUnableToAcceptOnlineReplicasLimit(final ResourceControllerDataProvider cache, final HelixManager manager) { - int maxOfflineInstancesAllowed = cache.getClusterConfig().getMaxOfflineInstancesAllowed(); - if (maxOfflineInstancesAllowed >= 0) { - int offlineCount = - cache.getAssignableInstances().size() - cache.getAssignableEnabledLiveInstances().size(); - if (offlineCount > maxOfflineInstancesAllowed) { + int maxInstancesUnableToAcceptOnlineReplicas = + cache.getClusterConfig().getMaxOfflineInstancesAllowed(); + if (maxInstancesUnableToAcceptOnlineReplicas >= 0) { + // Instead of only checking the offline instances, we consider how many instances in the cluster + // are not assignable and live. This is because some instances may be online but have an unassignable + // InstanceOperation such as EVACUATE and DISABLE. We will exclude SWAP_IN and UNKNOWN instances from + // this count because they should not account against the capacity of the cluster. + int instancesUnableToAcceptOnlineReplicas = cache.getInstanceConfigMap().entrySet().stream() + .filter(instanceEntry -> !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceEntry.getValue().getInstanceOperation().getOperation())) + .collect(Collectors.toSet()) + .size() - cache.getEnabledLiveInstances().size(); + if (instancesUnableToAcceptOnlineReplicas > maxInstancesUnableToAcceptOnlineReplicas) { String errMsg = String.format( - "Offline Instances count %d greater than allowed count %d. Put cluster %s into " - + "maintenance mode.", - offlineCount, maxOfflineInstancesAllowed, cache.getClusterName()); + "Instances unable to take ONLINE replicas count %d greater than allowed count %d. 
Put cluster %s into " + + "maintenance mode.", instancesUnableToAcceptOnlineReplicas, + maxInstancesUnableToAcceptOnlineReplicas, cache.getClusterName()); if (manager != null) { if (manager.getHelixDataAccessor() .getProperty(manager.getHelixDataAccessor().keyBuilder().maintenance()) == null) { manager.getClusterManagmentTool() .autoEnableMaintenanceMode(manager.getClusterName(), true, errMsg, - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); LogUtil.logWarn(logger, _eventId, errMsg); } } else { diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java index 6fbb2b63e5..da972d682c 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/CurrentStateComputationStage.java @@ -28,6 +28,7 @@ import java.util.concurrent.ExecutorService; import java.util.stream.Collectors; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.LogUtil; import org.apache.helix.controller.dataproviders.BaseControllerDataProvider; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; @@ -43,6 +44,7 @@ import org.apache.helix.controller.rebalancer.waged.model.ClusterModelProvider; import org.apache.helix.model.CurrentState; import org.apache.helix.model.IdealState; +import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.LiveInstance; import org.apache.helix.model.Message; import org.apache.helix.model.Message.MessageType; @@ -86,24 +88,40 @@ public void process(ClusterEvent event) throws Exception { Map liveInstances = cache.getLiveInstances(); final CurrentStateOutput currentStateOutput = new CurrentStateOutput(); + final CurrentStateOutput currentStateExcludingUnknown = new CurrentStateOutput(); for (LiveInstance instance : liveInstances.values()) { String instanceName = instance.getInstanceName(); String instanceSessionId = instance.getEphemeralOwner(); + InstanceConfig instanceConfig = cache.getInstanceConfigMap().get(instanceName); + + Set existingStaleMessages = cache.getStaleMessagesByInstance(instanceName); + Map messages = cache.getMessages(instanceName); + Map relayMessages = cache.getRelayMessages(instanceName); // update current states. updateCurrentStates(instance, cache.getCurrentState(instanceName, instanceSessionId, _isTaskFrameworkPipeline).values(), currentStateOutput, resourceMap); - - Set existingStaleMessages = cache.getStaleMessagesByInstance(instanceName); // update pending messages - Map messages = cache.getMessages(instanceName); - Map relayMessages = cache.getRelayMessages(instanceName); updatePendingMessages(instance, cache, messages.values(), relayMessages.values(), existingStaleMessages, currentStateOutput, resourceMap); + + // Only update the currentStateExcludingUnknown if the instance is not in UNKNOWN InstanceOperation. + if (instanceConfig == null || !instanceConfig.getInstanceOperation() + .getOperation() + .equals(InstanceConstants.InstanceOperation.UNKNOWN)) { + // update current states. 
+ updateCurrentStates(instance, + cache.getCurrentState(instanceName, instanceSessionId, _isTaskFrameworkPipeline) + .values(), currentStateExcludingUnknown, resourceMap); + // update pending messages + updatePendingMessages(instance, cache, messages.values(), relayMessages.values(), + existingStaleMessages, currentStateExcludingUnknown, resourceMap); + } } event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateExcludingUnknown); final ClusterStatusMonitor clusterStatusMonitor = event.getAttribute(AttributeName.clusterStatusMonitor.name()); diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java index b3990046c0..ba2e16018f 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/IntermediateStateCalcStage.java @@ -661,11 +661,11 @@ private Map getRequiredStates(String resourceName, // preference list if (preferenceList != null) { return stateModelDefinition.getStateCountMap((int) preferenceList.stream().filter( - i -> resourceControllerDataProvider.getAssignableEnabledLiveInstances().contains(i)) + i -> resourceControllerDataProvider.getEnabledLiveInstances().contains(i)) .count(), requiredNumReplica); // StateModelDefinition's counts } return stateModelDefinition.getStateCountMap( - resourceControllerDataProvider.getAssignableEnabledLiveInstances().size(), + resourceControllerDataProvider.getEnabledLiveInstances().size(), requiredNumReplica); // StateModelDefinition's counts } diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java index d262d14023..1a5185a052 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/MaintenanceRecoveryStage.java @@ -82,6 +82,7 @@ public void execute(final ClusterEvent event) throws Exception { String reason; switch (internalReason) { case MAX_OFFLINE_INSTANCES_EXCEEDED: + case MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS: // Check on the number of offline/disabled instances int numOfflineInstancesForAutoExit = cache.getClusterConfig().getNumOfflineInstancesForAutoExit(); @@ -90,7 +91,7 @@ public void execute(final ClusterEvent event) throws Exception { } // Get the count of all instances that are either offline or disabled int offlineDisabledCount = - cache.getAssignableInstances().size() - cache.getAssignableEnabledLiveInstances().size(); + cache.getAssignableInstances().size() - cache.getEnabledLiveInstances().size(); shouldExitMaintenance = offlineDisabledCount <= numOfflineInstancesForAutoExit; reason = String.format( "Auto-exiting maintenance mode for cluster %s; Num. 
of offline/disabled instances is %d, less than or equal to the exit threshold %d", diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java b/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java index 5c22c11dba..859c6679e9 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/MessageGenerationPhase.java @@ -135,6 +135,8 @@ private void generateMessage(final Resource resource, final BaseControllerDataPr new HashMap<>(resourcesStateMap.getInstanceStateMap(resourceName, partition)); Map pendingStateMap = currentStateOutput.getPendingStateMap(resourceName, partition); + Map currentStateMap = + currentStateOutput.getCurrentStateMap(resourceName, partition); // The operation is combing pending state with best possible state. Since some replicas have // been moved from one instance to another, the instance will exist in pending state but not @@ -146,6 +148,16 @@ private void generateMessage(final Resource resource, final BaseControllerDataPr } } + // Look through the current state map and add a DROPPED message if the instance is not in the + // resourceStateMap. This instance may not have been dropped by the rebalance strategy. + // This check is required to ensure that the instances removed from the ideal state stateMap + // are properly dropped. + for (String instance : currentStateMap.keySet()) { + if (!instanceStateMap.containsKey(instance)) { + instanceStateMap.put(instance, HelixDefinedState.DROPPED.name()); + } + } + // we should generate message based on the desired-state priority // so keep generated messages in a temp map keyed by state // desired-state->list of generated-messages diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java index 7e8bde9d1b..6a0ae76fc5 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java +++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ReadClusterDataStage.java @@ -90,7 +90,7 @@ public void process(ClusterEvent event) throws Exception { instanceMessageMap.put(instanceName, Sets.newHashSet(dataProvider.getMessages(instanceName).values())); } - if (!InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (!config.getInstanceEnabled()) { disabledInstanceSet.add(instanceName); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java b/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java index fd91588359..036a548bbb 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java +++ b/helix-core/src/main/java/org/apache/helix/examples/IdealStateBuilderExample.java @@ -19,6 +19,7 @@ * under the License. 
*/ +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; @@ -73,7 +74,7 @@ public static void main(String[] args) { InstanceConfig config = new InstanceConfig("localhost_" + port); config.setHostName("localhost"); config.setPort(Integer.toString(port)); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java b/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java index 1f3939c7e4..fa5b7cd72a 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java +++ b/helix-core/src/main/java/org/apache/helix/examples/IdealStateExample.java @@ -19,6 +19,7 @@ * under the License. */ +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.manager.zk.ZNRecordSerializer; @@ -113,7 +114,7 @@ public static void main(String[] args) throws Exception { InstanceConfig config = new InstanceConfig("localhost_" + port); config.setHostName("localhost"); config.setPort(Integer.toString(port)); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, config); } diff --git a/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java b/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java index 9cc14b6039..37fd1ac150 100644 --- a/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java +++ b/helix-core/src/main/java/org/apache/helix/examples/Quickstart.java @@ -29,6 +29,7 @@ import org.apache.helix.HelixManager; import org.apache.helix.HelixManagerFactory; import org.apache.helix.InstanceType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.HelixControllerMain; import org.apache.helix.manager.zk.ZKHelixAdmin; import org.apache.helix.model.ExternalView; @@ -69,7 +70,7 @@ public class Quickstart { InstanceConfig instanceConfig = new InstanceConfig("localhost_" + port); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + port); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); INSTANCE_CONFIG_LIST.add(instanceConfig); } @@ -190,7 +191,7 @@ private static void addNode() throws Exception { InstanceConfig instanceConfig = new InstanceConfig("localhost_" + port); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + port); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); echo("ADDING NEW NODE :" + instanceConfig.getInstanceName() + ". 
Partitions will move from old nodes to the new node."); admin.addInstance(CLUSTER_NAME, instanceConfig); diff --git a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java index d52967f5c3..39ae9ae67c 100644 --- a/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java +++ b/helix-core/src/main/java/org/apache/helix/manager/zk/ZKHelixAdmin.java @@ -42,7 +42,6 @@ import javax.annotation.Nullable; import com.google.common.collect.ImmutableSet; -import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.helix.AccessOption; import org.apache.helix.BaseDataAccessor; import org.apache.helix.ConfigAccessor; @@ -61,7 +60,6 @@ import org.apache.helix.api.status.ClusterManagementModeRequest; import org.apache.helix.api.topology.ClusterTopology; import org.apache.helix.constants.InstanceConstants; -import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer; import org.apache.helix.controller.rebalancer.strategy.RebalanceStrategy; import org.apache.helix.controller.rebalancer.util.WagedValidationUtil; import org.apache.helix.controller.rebalancer.waged.WagedRebalancer; @@ -70,7 +68,6 @@ import org.apache.helix.model.ClusterConstraints; import org.apache.helix.model.ClusterConstraints.ConstraintType; import org.apache.helix.model.ClusterStatus; -import org.apache.helix.model.ClusterTopologyConfig; import org.apache.helix.model.ConstraintItem; import org.apache.helix.model.ControllerHistory; import org.apache.helix.model.CurrentState; @@ -90,11 +87,11 @@ import org.apache.helix.model.PauseSignal; import org.apache.helix.model.ResourceConfig; import org.apache.helix.model.StateModelDefinition; -import org.apache.helix.model.builder.HelixConfigScopeBuilder; import org.apache.helix.msdcommon.exception.InvalidRoutingDataException; import org.apache.helix.tools.DefaultIdealStateCalculator; import org.apache.helix.util.ConfigStringUtil; import org.apache.helix.util.HelixUtil; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.util.RebalanceUtil; import org.apache.helix.zookeeper.api.client.HelixZkClient; import org.apache.helix.zookeeper.api.client.RealmAwareZkClient; @@ -119,13 +116,14 @@ public class ZKHelixAdmin implements HelixAdmin { public static final String CONNECTION_TIMEOUT = "helixAdmin.timeOutInSec"; private static final String MAINTENANCE_ZNODE_ID = "maintenance"; private static final int DEFAULT_SUPERCLUSTER_REPLICA = 3; - private static final ImmutableSet ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE = - ImmutableSet.of("", InstanceConstants.InstanceOperation.SWAP_IN.name()); - private static final ImmutableSet INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT = - ImmutableSet.of(InstanceConstants.InstanceOperation.EVACUATE.name()); + private static final ImmutableSet + INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT = + ImmutableSet.of(InstanceConstants.InstanceOperation.EVACUATE, + InstanceConstants.InstanceOperation.UNKNOWN); private final RealmAwareZkClient _zkClient; private final ConfigAccessor _configAccessor; + private final BaseDataAccessor _baseDataAccessor; // true if ZKHelixAdmin was instantiated with a RealmAwareZkClient, false otherwise // This is used for close() to determine how ZKHelixAdmin should close the underlying ZkClient private final boolean _usesExternalZkClient; @@ -143,6 +141,7 @@ public class ZKHelixAdmin implements HelixAdmin { public ZKHelixAdmin(RealmAwareZkClient zkClient) { _zkClient = zkClient; _configAccessor = new 
ConfigAccessor(zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = true; } @@ -183,12 +182,14 @@ public ZKHelixAdmin(String zkAddress) { _zkClient = zkClient; _configAccessor = new ConfigAccessor(_zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = false; } private ZKHelixAdmin(RealmAwareZkClient zkClient, boolean usesExternalZkClient) { _zkClient = zkClient; _configAccessor = new ConfigAccessor(_zkClient); + _baseDataAccessor = new ZkBaseDataAccessor<>(zkClient); _usesExternalZkClient = usesExternalZkClient; } @@ -206,113 +207,29 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { throw new HelixException("Node " + nodeId + " already exists in cluster " + clusterName); } - if (!ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE.contains( - instanceConfig.getInstanceOperation())) { + List matchingLogicalIdInstances = + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); + if (matchingLogicalIdInstances.size() > 1) { throw new HelixException( - "Instance can only be added if InstanceOperation is set to one of" + "the following: " - + ALLOWED_INSTANCE_OPERATIONS_FOR_ADD_INSTANCE + " This instance: " + nodeId - + " has InstanceOperation set to " + instanceConfig.getInstanceOperation()); + "There is already more than one instance with the same logicalId in the cluster: " + + matchingLogicalIdInstances.stream().map(InstanceConfig::getInstanceName) + .collect(Collectors.joining(", ")) + + " Please make sure there are at most 2 instances with the same logicalId in the cluster."); } - // Get the topology key used to determine the logicalId of a node. - ClusterConfig clusterConfig = _configAccessor.getClusterConfig(clusterName); - ClusterTopologyConfig clusterTopologyConfig = - ClusterTopologyConfig.createFromClusterConfig(clusterConfig); - String logicalIdKey = clusterTopologyConfig.getEndNodeType(); - String faultZoneKey = clusterTopologyConfig.getFaultZoneType(); - String toAddInstanceLogicalId = instanceConfig.getLogicalId(logicalIdKey); - - HelixConfigScope instanceConfigScope = - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build(); - List existingInstanceIds = getConfigKeys(instanceConfigScope); - List foundInstanceConfigsWithMatchingLogicalId = - existingInstanceIds.parallelStream() - .map(existingInstanceId -> getInstanceConfig(clusterName, existingInstanceId)).filter( - existingInstanceConfig -> existingInstanceConfig.getLogicalId(logicalIdKey) - .equals(toAddInstanceLogicalId)).collect(Collectors.toList()); - - if (foundInstanceConfigsWithMatchingLogicalId.size() >= 2) { - // If the length is 2, we cannot add an instance with the same logicalId as an existing instance - // regardless of InstanceOperation. - throw new HelixException( - "There can only be 2 instances with the same logicalId in a cluster. " - + "Existing instances: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getInstanceName() + " and " + foundInstanceConfigsWithMatchingLogicalId.get(1) - .getInstanceName() + " already have the same logicalId: " + toAddInstanceLogicalId - + "; therefore, " + nodeId + " cannot be added to the cluster."); - } else if (foundInstanceConfigsWithMatchingLogicalId.size() == 1) { - // If there is only one instance with the same logicalId, - // we can infer that the intended behaviour is to SWAP_IN or EVACUATE + ADD. 
- if (foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - // If the existing instance with the same logicalId has SWAP_OUT InstanceOperation - - // If the InstanceOperation is unset, we will set it to SWAP_IN. - if (!instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.SWAP_IN); - } - - // If the existing instance with the same logicalId is not in the same FAULT_ZONE as this instance, we cannot - // add this instance. - if (!foundInstanceConfigsWithMatchingLogicalId.get(0).getDomainAsMap() - .containsKey(faultZoneKey) || !instanceConfig.getDomainAsMap().containsKey(faultZoneKey) - || !foundInstanceConfigsWithMatchingLogicalId.get(0).getDomainAsMap().get(faultZoneKey) - .equals(instanceConfig.getDomainAsMap().get(faultZoneKey))) { - throw new HelixException( - "Instance can only be added if the SWAP_OUT instance sharing the same logicalId is in the same FAULT_ZONE" - + " as this instance. " + "Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has FAULT_ZONE_TYPE: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getDomainAsMap().get(faultZoneKey) + " and this instance: " + nodeId - + " has FAULT_ZONE_TYPE: " + instanceConfig.getDomainAsMap().get(faultZoneKey)); - } - - Map foundInstanceCapacityMap = - foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceCapacityMap().isEmpty() - ? clusterConfig.getDefaultInstanceCapacityMap() - : foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceCapacityMap(); - Map instanceCapacityMap = instanceConfig.getInstanceCapacityMap().isEmpty() - ? clusterConfig.getDefaultInstanceCapacityMap() - : instanceConfig.getInstanceCapacityMap(); - // If the instance does not have the same capacity, we cannot add this instance. - if (!new EqualsBuilder().append(foundInstanceCapacityMap, instanceCapacityMap).isEquals()) { - throw new HelixException( - "Instance can only be added if the SWAP_OUT instance sharing the same logicalId has the same capacity" - + " as this instance. " + "Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has capacity: " + foundInstanceCapacityMap + " and this instance: " + nodeId - + " has capacity: " + instanceCapacityMap); - } - } else if (foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) { - // No need to check anything on the new node, the old node will be evacuated and the new node - // will be added. - } else { - // If the instanceConfig.getInstanceEnabled() is true and the existing instance with the same logicalId - // does not have InstanceOperation set to one of the above, we cannot add this instance. - throw new HelixException( - "Instance can only be added if the exising instance sharing the same logicalId" - + " has InstanceOperation set to " - + InstanceConstants.InstanceOperation.SWAP_OUT.name() - + " and this instance has InstanceOperation set to " - + InstanceConstants.InstanceOperation.SWAP_IN.name() - + " or the existing instance sharing the same logicalId has Instance Operation set to " - + InstanceConstants.InstanceOperation.EVACUATE.name() - + " and this instance has InstanceOperation unset. 
Existing instance: " - + foundInstanceConfigsWithMatchingLogicalId.get(0).getInstanceName() - + " has InstanceOperation: " + foundInstanceConfigsWithMatchingLogicalId.get(0) - .getInstanceOperation()); - } - } else if (!instanceConfig.getInstanceOperation().isEmpty()) { - // If there are no instances with the same logicalId, we can only add this instance if InstanceOperation - // is unset because it is a new instance. - throw new HelixException( - "There is no instance with logicalId: " + toAddInstanceLogicalId + " in cluster: " - + clusterName + "; therefore, " + nodeId - + " cannot join cluster with InstanceOperation set to " - + instanceConfig.getInstanceOperation() + "."); + InstanceConstants.InstanceOperation attemptedInstanceOperation = + instanceConfig.getInstanceOperation().getOperation(); + try { + InstanceUtil.validateInstanceOperationTransition(_configAccessor, clusterName, instanceConfig, + InstanceConstants.InstanceOperation.UNKNOWN, attemptedInstanceOperation); + } catch (HelixException e) { + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.UNKNOWN); + logger.error("Failed to add instance " + instanceConfig.getInstanceName() + " to cluster " + + clusterName + " with instance operation " + attemptedInstanceOperation + + ". Setting INSTANCE_OPERATION to " + instanceConfig.getInstanceOperation() + .getOperation() + + " instead.", e); } ZKUtil.createChildren(_zkClient, instanceConfigsPath, instanceConfig.getRecord()); @@ -325,8 +242,7 @@ public void addInstance(String clusterName, InstanceConfig instanceConfig) { _zkClient.createPersistent(PropertyPathBuilder.instanceError(clusterName, nodeId), true); _zkClient.createPersistent(PropertyPathBuilder.instanceStatusUpdate(clusterName, nodeId), true); _zkClient.createPersistent(PropertyPathBuilder.instanceHistory(clusterName, nodeId), true); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.participantHistory(nodeId), new ParticipantHistory(nodeId)); } @@ -429,8 +345,7 @@ public InstanceConfig getInstanceConfig(String clusterName, String instanceName) "instance" + instanceName + " does not exist in cluster " + clusterName); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -449,8 +364,7 @@ public boolean setInstanceConfig(String clusterName, String instanceName, "instance" + instanceName + " does not exist in cluster " + clusterName); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey instanceConfigPropertyKey = accessor.keyBuilder().instanceConfig(instanceName); InstanceConfig currentInstanceConfig = accessor.getProperty(instanceConfigPropertyKey); if (!newInstanceConfig.getHostName().equals(currentInstanceConfig.getHostName()) @@ -464,12 +378,14 @@ public boolean setInstanceConfig(String clusterName, String instanceName, return accessor.setProperty(instanceConfigPropertyKey, newInstanceConfig); } + @Deprecated @Override public void enableInstance(final 
String clusterName, final String instanceName, final boolean enabled) { enableInstance(clusterName, instanceName, enabled, null, null); } + @Deprecated @Override public void enableInstance(final String clusterName, final String instanceName, final boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -477,28 +393,12 @@ public void enableInstance(final String clusterName, final String instanceName, clusterName); BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); - // If enabled is set to true and InstanceOperation is SWAP_IN, we should fail if there is not a - // matching SWAP_OUT instance. - InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); - if (enabled && instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { - InstanceConfig matchingSwapInstance = findMatchingSwapInstance(clusterName, instanceConfig); - if (matchingSwapInstance == null || !matchingSwapInstance.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) { - throw new HelixException("Instance cannot be enabled if InstanceOperation is set to " - + instanceConfig.getInstanceOperation() + " when there is no matching " - + InstanceConstants.InstanceOperation.SWAP_OUT.name() + " instance."); - } - } - // Eventually we will have all instances' enable/disable information in clusterConfig. Now we // update both instanceConfig and clusterConfig in transition period. enableSingleInstance(clusterName, instanceName, enabled, baseAccessor, disabledType, reason); -// enableBatchInstances(clusterName, Collections.singletonList(instanceName), enabled, -// baseAccessor, disabledType, reason); - } + @Deprecated @Override public void enableInstance(String clusterName, List instances, boolean enabled) { // TODO: batch enable/disable is breaking backward compatibility on instance enable with older library @@ -509,132 +409,67 @@ public void enableInstance(String clusterName, List instances, boolean e //enableInstance(clusterName, instances, enabled, null, null); } + /** + * Set the InstanceOperation of an instance in the cluster. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation + */ @Override - // TODO: Name may change in future public void setInstanceOperation(String clusterName, String instanceName, @Nullable InstanceConstants.InstanceOperation instanceOperation) { + setInstanceOperation(clusterName, instanceName, instanceOperation, null, false); + } - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); - String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - - // InstanceOperation can only be set to SWAP_IN when the instance is added to the cluster - // or if it is disabled. - if (instanceOperation != null && instanceOperation.equals( - InstanceConstants.InstanceOperation.SWAP_IN) && getInstanceConfig(clusterName, - instanceName).getInstanceEnabled()) { - throw new HelixException("InstanceOperation should only be set to " - + InstanceConstants.InstanceOperation.SWAP_IN.name() - + " when an instance joins the cluster for the first time(when " - + "creating the InstanceConfig) or is disabled."); - } - - // InstanceOperation cannot be set to null if there is an instance with the same logicalId in - // the cluster which does not have InstanceOperation set to SWAP_IN or SWAP_OUT. 
- if (instanceOperation == null) { - InstanceConfig instanceConfig = getInstanceConfig(clusterName, instanceName); - String logicalIdKey = ClusterTopologyConfig.createFromClusterConfig( - _configAccessor.getClusterConfig(clusterName)).getEndNodeType(); - - HelixConfigScope instanceConfigScope = - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build(); - List existingInstanceIds = getConfigKeys(instanceConfigScope); - List matchingInstancesWithNonSwappingInstanceOperation = - existingInstanceIds.parallelStream() - .map(existingInstanceId -> getInstanceConfig(clusterName, existingInstanceId)).filter( - existingInstanceConfig -> - !existingInstanceConfig.getInstanceName().equals(instanceName) - && existingInstanceConfig.getLogicalId(logicalIdKey) - .equals(instanceConfig.getLogicalId(logicalIdKey)) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) - && !existingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.EVACUATE.name())) - .collect(Collectors.toList()); - - if (!matchingInstancesWithNonSwappingInstanceOperation.isEmpty()) { - throw new HelixException("InstanceOperation cannot be set to null for " + instanceName - + " if there are other instances with the same logicalId in the cluster that do not have" - + " InstanceOperation set to SWAP_IN, SWAP_OUT, or EVACUATE."); - } - } - - if (!baseAccessor.exists(path, 0)) { - throw new HelixException( - "Cluster " + clusterName + ", instance: " + instanceName + ", instance config does not exist"); - } - - boolean succeeded = baseAccessor.update(path, new DataUpdater() { - @Override - public ZNRecord update(ZNRecord currentData) { - if (currentData == null) { - throw new HelixException( - "Cluster: " + clusterName + ", instance: " + instanceName + ", participant config is null"); - } - - InstanceConfig config = new InstanceConfig(currentData); - config.setInstanceOperation(instanceOperation); - return config.getRecord(); - } - }, AccessOption.PERSISTENT); + /** + * Set the instanceOperation of an instance with {@link InstanceConstants.InstanceOperation}. + * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + */ + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason) { + setInstanceOperation(clusterName, instanceName, instanceOperation, reason, false); + } - if (!succeeded) { - throw new HelixException("Failed to update instance operation. Please check if instance is disabled."); - } + /** + * Set the instanceOperation of an instance with {@link InstanceConstants.InstanceOperation}. 
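+ * When overrideAll is true, the operation is recorded with the ADMIN source rather than USER (see the implementation below).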
+ * + * @param clusterName The cluster name + * @param instanceName The instance name + * @param instanceOperation The instance operation type + * @param reason The reason for the operation + * @param overrideAll Whether to override all existing instance operations from all other + * instance operations + */ + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason, + boolean overrideAll) { + InstanceConfig.InstanceOperation instanceOperationObj = + new InstanceConfig.InstanceOperation.Builder().setOperation( + instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE + : instanceOperation).setReason(reason).setSource( + overrideAll ? InstanceConstants.InstanceOperationSource.ADMIN + : InstanceConstants.InstanceOperationSource.USER).build(); + InstanceUtil.setInstanceOperation(_configAccessor, _baseDataAccessor, clusterName, instanceName, + instanceOperationObj); } @Override public boolean isEvacuateFinished(String clusterName, String instanceName) { if (!instanceHasFullAutoCurrentStateOrMessage(clusterName, instanceName)) { InstanceConfig config = getInstanceConfig(clusterName, instanceName); - return config != null && config.getInstanceOperation().equals(InstanceConstants.InstanceOperation.EVACUATE.name()); + return config != null && config.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.EVACUATE); } return false; } - /** - * Find the instance that the passed instance is swapping with. If the passed instance has - * SWAP_OUT instanceOperation, then find the corresponding instance that has SWAP_IN - * instanceOperation. If the passed instance has SWAP_IN instanceOperation, then find the - * corresponding instance that has SWAP_OUT instanceOperation. - * - * @param clusterName The cluster name - * @param instanceConfig The instance to find the swap instance for - * @return The swap instance if found, null otherwise. - */ - @Nullable - private InstanceConfig findMatchingSwapInstance(String clusterName, - InstanceConfig instanceConfig) { - String logicalIdKey = - ClusterTopologyConfig.createFromClusterConfig(_configAccessor.getClusterConfig(clusterName)) - .getEndNodeType(); - - for (String potentialSwappingInstance : getConfigKeys( - new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT, - clusterName).build())) { - InstanceConfig potentialSwappingInstanceConfig = - getInstanceConfig(clusterName, potentialSwappingInstance); - - // Return if there is a matching Instance with the same logicalId and opposite InstanceOperation swap operation. - if (potentialSwappingInstanceConfig.getLogicalId(logicalIdKey) - .equals(instanceConfig.getLogicalId(logicalIdKey)) && ( - instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) - && potentialSwappingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name())) || ( - instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) - && potentialSwappingInstanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()))) { - return potentialSwappingInstanceConfig; - } - } - - return null; - } - /** * Check to see if swapping between two instances is ready to be completed. Checks: 1. Both * instances must be alive. 2. 
Both instances must only have one session and not be carrying over @@ -650,7 +485,7 @@ private InstanceConfig findMatchingSwapInstance(String clusterName, */ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, String swapInInstanceName) { - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, baseAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); @@ -661,14 +496,13 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, accessor.getProperty(keyBuilder.liveInstance(swapInInstanceName)); InstanceConfig swapOutInstanceConfig = getInstanceConfig(clusterName, swapOutInstanceName); InstanceConfig swapInInstanceConfig = getInstanceConfig(clusterName, swapInInstanceName); - if (swapInLiveInstance == null || !swapInInstanceConfig.getInstanceEnabled()) { + if (swapInLiveInstance == null) { logger.warn( - "SwapOutInstance {} is {} + {} and SwapInInstance {} is {} + {} for cluster {}. Swap will" - + " not complete unless SwapInInstance instance is ENABLED and ONLINE.", + "SwapOutInstance {} is {} + {} and SwapInInstance {} is OFFLINE + {} for cluster {}. Swap will" + + " not complete unless SwapInInstance instance is ONLINE.", swapOutInstanceName, swapOutLiveInstance != null ? "ONLINE" : "OFFLINE", - swapOutInstanceConfig.getInstanceEnabled() ? "ENABLED" : "DISABLED", swapInInstanceName, - swapInLiveInstance != null ? "ONLINE" : "OFFLINE", - swapInInstanceConfig.getInstanceEnabled() ? "ENABLED" : "DISABLED", clusterName); + swapOutInstanceConfig.getInstanceOperation().getOperation(), swapInInstanceName, + swapInInstanceConfig.getInstanceOperation().getOperation(), clusterName); return false; } @@ -705,21 +539,15 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, return false; } - // 4. Collect a list of all partitions that have a current state on swapOutInstance - String swapOutLastActiveSession; - if (swapOutLiveInstance == null) { - // SwapOutInstance is down, try to find the last active session - if (swapOutSessions.size() != 1) { - logger.warn( - "SwapOutInstance {} is offline and has {} sessions for cluster {}. Swap can't be " - + "verified if last active session can't be determined. There should only be one session.", - swapOutInstanceName, swapOutSessions.size(), clusterName); - return false; - } - swapOutLastActiveSession = swapOutSessions.get(0); - } else { - swapOutLastActiveSession = swapOutLiveInstance.getEphemeralOwner(); + // 4. If the swap-out instance is not alive or is disabled, we return true without checking + // the current states on the swap-in instance. + if (swapOutLiveInstance == null || swapOutInstanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.DISABLE)) { + return true; } + + // 5. 
Collect a list of all partitions that have a current state on swapOutInstance + String swapOutLastActiveSession = swapOutLiveInstance.getEphemeralOwner(); String swapInActiveSession = swapInLiveInstance.getEphemeralOwner(); // Iterate over all resources with current states on the swapOutInstance @@ -754,24 +582,22 @@ private boolean canCompleteSwap(String clusterName, String swapOutInstanceName, String swapOutPartitionState = swapOutResourceCurrentState.getState(partitionName); String swapInPartitionState = swapInResourceCurrentState.getState(partitionName); - // SwapInInstance should not have any partitions in ERROR state. - if (swapInPartitionState.equals(HelixDefinedState.ERROR.name())) { - logger.warn( - "SwapOutInstance {} has partition {} in state {} and SwapInInstance {} has partition {} in state {} for cluster {}." - + " Swap will not complete unless both instances have no partitions in ERROR state.", - swapOutInstanceName, partitionName, swapOutPartitionState, swapInInstanceName, - partitionName, swapInPartitionState, clusterName); - return false; - } - - // The state of the partition on the swapInInstance be in the topState or a secondTopState. - // It should be in a topState only if the state model allows multiple replicas in the topState. - // In all other cases it should be a secondTopState. - if (!(swapInPartitionState.equals(topState) || secondTopStates.contains( + // SwapInInstance should have the correct state for the partition. + // All states should match except when the swap-out partition is in ERROR state, or when the topState + // is not ALL_REPLICAS or ALL_CANDIDATE_NODES, in which case the swap-in partition should be in one of + // the secondTopStates. + if (!(swapOutPartitionState.equals(HelixDefinedState.ERROR.name()) || ( + topState.equals(swapOutPartitionState) && ( + swapOutPartitionState.equals(swapInPartitionState) || + !ImmutableSet.of(StateModelDefinition.STATE_REPLICA_COUNT_ALL_REPLICAS, + StateModelDefinition.STATE_REPLICA_COUNT_ALL_CANDIDATE_NODES).contains( + stateModelDefinition.getNumInstancesPerState( + stateModelDefinition.getTopState())) && secondTopStates.contains( + swapInPartitionState))) || swapOutPartitionState.equals( + swapInPartitionState))) { logger.warn( "SwapOutInstance {} has partition {} in {} but SwapInInstance {} has partition {} in state {} for cluster {}." - + " Swap will not complete unless SwapInInstance has partition in topState or secondState.", + + " Swap will not complete unless SwapInInstance has the partition in the correct state.", swapOutInstanceName, partitionName, swapOutPartitionState, swapInInstanceName, partitionName, swapInPartitionState, clusterName); return false; @@ -792,12 +618,22 @@ public boolean canCompleteSwap(String clusterName, String instanceName) { return false; } - InstanceConfig swapOutInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); - InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + List swappingInstances = + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); + if (swappingInstances.size() != 1) { + logger.warn( + "Instance {} in cluster {} is not swapping with any other instance. 
Cannot determine if the swap is complete.", + instanceName, clusterName); + return false; + } + + InstanceConfig swapOutInstanceConfig = !instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) + ? instanceConfig : swappingInstances.get(0); + InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig + : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -821,12 +657,22 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, return false; } - InstanceConfig swapOutInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_OUT.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); - InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name()) ? instanceConfig - : findMatchingSwapInstance(clusterName, instanceConfig); + List swappingInstances = + InstanceUtil.findInstancesWithMatchingLogicalId(_configAccessor, clusterName, + instanceConfig); + if (swappingInstances.size() != 1) { + logger.warn( + "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", + instanceName, clusterName); + return false; + } + + InstanceConfig swapOutInstanceConfig = !instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) + ? instanceConfig : swappingInstances.get(0); + InstanceConfig swapInInstanceConfig = instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN) ? instanceConfig + : swappingInstances.get(0); if (swapOutInstanceConfig == null || swapInInstanceConfig == null) { logger.warn( "Instance {} in cluster {} is not swapping with any other instance. Cannot determine if the swap is complete.", @@ -840,11 +686,39 @@ public boolean completeSwapIfPossible(String clusterName, String instanceName, return false; } - // Complete the swap by removing the InstanceOperation for the SWAP_IN node and disabling the SWAP_OUT node. - setInstanceOperation(clusterName, swapInInstanceConfig.getInstanceName(), null); - enableInstance(clusterName, swapOutInstanceConfig.getInstanceName(), false); + BaseDataAccessor baseAccessor = new ZkBaseDataAccessor<>(_zkClient); + String swapInInstanceConfigPath = + PropertyPathBuilder.instanceConfig(clusterName, swapInInstanceConfig.getInstanceName()); + String swapOutInstanceConfigPath = + PropertyPathBuilder.instanceConfig(clusterName, swapOutInstanceConfig.getInstanceName()); + + Map> updaterMap = new HashMap<>(); + updaterMap.put(swapInInstanceConfigPath, currentData -> { + if (currentData == null) { + throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName + + ", SWAP_IN instance config is null"); + } - return true; + InstanceConfig currentSwapOutInstanceConfig = + getInstanceConfig(clusterName, swapOutInstanceConfig.getInstanceName()); + InstanceConfig config = new InstanceConfig(currentData); + config.overwriteInstanceConfig(currentSwapOutInstanceConfig); + // Special handling in case the swap-out instance does not have HELIX_ENABLED or InstanceOperation set. 
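+ // (overwriteInstanceConfig applies the swap-out instance's settings to this swap-in config; the defaulting for a missing HELIX_ENABLED or InstanceOperation field lives in InstanceConfig and is not shown in this diff)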
+ return config.getRecord(); + }); + + updaterMap.put(swapOutInstanceConfigPath, currentData -> { + if (currentData == null) { + throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName + + ", swap out instance config is null"); + } + + InstanceConfig config = new InstanceConfig(currentData); + config.setInstanceOperation(InstanceConstants.InstanceOperation.UNKNOWN); + return config.getRecord(); + }); + + return baseAccessor.multiSet(updaterMap); } @Override @@ -852,7 +726,7 @@ public boolean isReadyForPreparingJoiningCluster(String clusterName, String inst if (!instanceHasFullAutoCurrentStateOrMessage(clusterName, instanceName)) { InstanceConfig config = getInstanceConfig(clusterName, instanceName); return config != null && INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT.contains( - config.getInstanceOperation()); + config.getInstanceOperation().getOperation()); } return false; } @@ -866,7 +740,7 @@ public boolean isReadyForPreparingJoiningCluster(String clusterName, String inst */ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, String instanceName) { - HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); // check the instance is alive @@ -877,7 +751,7 @@ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, return false; } - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; // count number of sessions under CurrentState folder. If it is carrying over from prv session, // then there are > 1 session ZNodes. List sessions = baseAccessor.getChildNames(PropertyPathBuilder.instanceCurrentState(clusterName, instanceName), 0); @@ -917,7 +791,7 @@ private boolean instanceHasFullAutoCurrentStateOrMessage(String clusterName, public void enableResource(final String clusterName, final String resourceName, final boolean enabled) { logger.info("{} resource {} in cluster {}.", enabled ? "Enable" : "Disable", resourceName, clusterName); String path = PropertyPathBuilder.idealState(clusterName, resourceName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; if (!baseAccessor.exists(path, 0)) { throw new HelixException("Cluster " + clusterName + ", resource: " + resourceName + ", ideal-state does not exist"); @@ -944,7 +818,7 @@ public void enablePartition(final boolean enabled, final String clusterName, instanceName, clusterName); String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; // check instanceConfig exists if (!baseAccessor.exists(path, 0)) { @@ -1023,8 +897,7 @@ public void enableCluster(String clusterName, boolean enabled) { public void enableCluster(String clusterName, boolean enabled, String reason) { logger.info("{} cluster {} for reason {}.", enabled ? "Enable" : "Disable", clusterName, reason == null ? 
"NULL" : reason); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); if (enabled) { @@ -1048,8 +921,7 @@ public void enableMaintenanceMode(String clusterName, boolean enabled) { @Override public boolean isInMaintenanceMode(String clusterName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getBaseDataAccessor() .exists(keyBuilder.maintenance().getPath(), AccessOption.PERSISTENT); @@ -1085,6 +957,136 @@ public ClusterManagementMode getClusterManagementMode(String clusterName) { : new ClusterManagementMode(status.getManagementMode(), status.getManagementModeStatus()); } + @Override + public void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List partitionNames) { + logger.info("Set partitions {} for resource {} on instance {} in cluster {} to ERROR state.", + partitionNames == null ? "NULL" : HelixUtil.serializeByComma(partitionNames), resourceName, + instanceName, clusterName); + sendStateTransitionMessage(clusterName, instanceName, resourceName, partitionNames, + StateTransitionType.SET_TO_ERROR); + } + + private void sendStateTransitionMessage(String clusterName, String instanceName, + String resourceName, List partitionNames, StateTransitionType stateTransitionType) { + HelixDataAccessor accessor = + new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + PropertyKey.Builder keyBuilder = accessor.keyBuilder(); + + // check the instance is alive + LiveInstance liveInstance = accessor.getProperty(keyBuilder.liveInstance(instanceName)); + if (liveInstance == null) { + // check if the instance exists in the cluster + String instanceConfigPath = PropertyPathBuilder.instanceConfig(clusterName, instanceName); + throw new HelixException(String.format( + (_zkClient.exists(instanceConfigPath) ? SetPartitionFailureReason.INSTANCE_NOT_ALIVE + : SetPartitionFailureReason.INSTANCE_NON_EXISTENT).getMessage(resourceName, + partitionNames, instanceName, instanceName, clusterName, stateTransitionType))); + } + + // check resource exists in ideal state + IdealState idealState = accessor.getProperty(keyBuilder.idealStates(resourceName)); + if (idealState == null) { + throw new HelixException( + String.format(SetPartitionFailureReason.RESOURCE_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, resourceName, clusterName, stateTransitionType))); + } + + // check partition exists in resource + Set partitionsNames = new HashSet(partitionNames); + Set partitions = (idealState.getRebalanceMode() == RebalanceMode.CUSTOMIZED) + ? 
idealState.getRecord().getMapFields().keySet() + : idealState.getRecord().getListFields().keySet(); + if (!partitions.containsAll(partitionsNames)) { + throw new HelixException( + String.format(SetPartitionFailureReason.PARTITION_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, partitionNames.toString(), clusterName, stateTransitionType))); + } + + // check partition is in ERROR state if reset is set to True + String sessionId = liveInstance.getEphemeralOwner(); + CurrentState curState = + accessor.getProperty(keyBuilder.currentState(instanceName, sessionId, resourceName)); + if (stateTransitionType.equals(StateTransitionType.RESET)) { + for (String partitionName : partitionNames) { + if (!curState.getState(partitionName).equals(HelixDefinedState.ERROR.toString())) { + throw new HelixException(String.format( + SetPartitionFailureReason.PARTITION_NOT_ERROR.getMessage(resourceName, partitionNames, + instanceName, partitionNames.toString(), clusterName, stateTransitionType))); + } + } + } + + // check stateModelDef exists + String stateModelDef = idealState.getStateModelDefRef(); + StateModelDefinition stateModel = accessor.getProperty(keyBuilder.stateModelDef(stateModelDef)); + if (stateModel == null) { + throw new HelixException( + String.format(SetPartitionFailureReason.STATE_MODEL_NON_EXISTENT.getMessage(resourceName, + partitionNames, instanceName, stateModelDef, clusterName, stateTransitionType))); + } + + // check there is no pending messages for the partitions exist + List messages = accessor.getChildValues(keyBuilder.messages(instanceName), true); + for (Message message : messages) { + if (!MessageType.STATE_TRANSITION.name().equalsIgnoreCase(message.getMsgType()) + || !sessionId.equals(message.getTgtSessionId()) + || !resourceName.equals(message.getResourceName()) + || !partitionsNames.contains(message.getPartitionName())) { + continue; + } + + throw new HelixException(String.format( + "Can't %s state for %s.%s on %s, because a pending message %s exists for resource %s", + stateTransitionType.name(), resourceName, partitionNames, instanceName, message, + message.getResourceName())); + } + + String adminName = null; + try { + adminName = InetAddress.getLocalHost().getCanonicalHostName() + "-ADMIN"; + } catch (UnknownHostException e) { + logger.info("Unable to get host name. 
Will set it to UNKNOWN, mostly ignorable", e); + adminName = "UNKNOWN"; + } + + List stateTransitionMessages = new ArrayList(); + List messageKeys = new ArrayList(); + for (String partitionName : partitionNames) { + String msgId = UUID.randomUUID().toString(); + Message message = new Message(MessageType.STATE_TRANSITION, msgId); + message.setSrcName(adminName); + message.setTgtName(instanceName); + message.setMsgState(MessageState.NEW); + message.setPartitionName(partitionName); + message.setResourceName(resourceName); + message.setTgtSessionId(sessionId); + message.setStateModelDef(stateModelDef); + message.setStateModelFactoryName(idealState.getStateModelFactoryName()); + // if reset == TRUE, send ERROR to initialState message + // else, send * to ERROR state message + if (stateTransitionType.equals(StateTransitionType.RESET)) { + message.setFromState(HelixDefinedState.ERROR.toString()); + message.setToState(stateModel.getInitialState()); + } + if (stateTransitionType.equals(StateTransitionType.SET_TO_ERROR)) { + message.setFromState("*"); + message.setToState(HelixDefinedState.ERROR.toString()); + } + if (idealState.getResourceGroupName() != null) { + message.setResourceGroupName(idealState.getResourceGroupName()); + } + if (idealState.getInstanceGroupTag() != null) { + message.setResourceTag(idealState.getInstanceGroupTag()); + } + + stateTransitionMessages.add(message); + messageKeys.add(keyBuilder.message(instanceName, message.getId())); + } + + accessor.setChildren(messageKeys, stateTransitionMessages); + } + private void enableClusterPauseMode(String clusterName, boolean cancelPendingST, String reason) { String hostname = NetworkUtil.getLocalhostName(); logger.info( @@ -1168,8 +1170,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, final String reason, final MaintenanceSignal.AutoTriggerReason internalReason, final Map customFields, final MaintenanceSignal.TriggeringEntity triggeringEntity) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); logger.info("Cluster {} {} {} maintenance mode for reason {}.", clusterName, triggeringEntity == MaintenanceSignal.TriggeringEntity.CONTROLLER ? 
"automatically" @@ -1230,7 +1231,7 @@ private void processMaintenanceMode(String clusterName, final boolean enabled, } } - private enum ResetPartitionFailureReason { + private enum SetPartitionFailureReason { INSTANCE_NOT_ALIVE("%s is not alive in cluster %s"), INSTANCE_NON_EXISTENT("%s does not exist in cluster %s"), RESOURCE_NON_EXISTENT("resource %s is not added to cluster %s"), @@ -1240,129 +1241,33 @@ private enum ResetPartitionFailureReason { private String message; - ResetPartitionFailureReason(String message) { + SetPartitionFailureReason(String message) { this.message = message; } public String getMessage(String resourceName, List partitionNames, String instanceName, - String errorStateEntity, String clusterName) { - return String.format("Can't reset state for %s.%s on %s, because " + message, resourceName, - partitionNames, instanceName, errorStateEntity, clusterName); + String errorStateEntity, String clusterName, StateTransitionType stateTransitionType) { + return String.format("Can't %s state for %s.%s on %s, because " + message, + stateTransitionType.name(), resourceName, partitionNames, instanceName, errorStateEntity, + clusterName); } } + private enum StateTransitionType { + // sets state from ERROR to INIT. + RESET, + // sets state from ANY to ERROR. + SET_TO_ERROR, + // Unknown StateTransitionType + UNDEFINED + } @Override public void resetPartition(String clusterName, String instanceName, String resourceName, List partitionNames) { logger.info("Reset partitions {} for resource {} on instance {} in cluster {}.", partitionNames == null ? "NULL" : HelixUtil.serializeByComma(partitionNames), resourceName, instanceName, clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); - PropertyKey.Builder keyBuilder = accessor.keyBuilder(); - - // check the instance is alive - LiveInstance liveInstance = accessor.getProperty(keyBuilder.liveInstance(instanceName)); - if (liveInstance == null) { - // check if the instance exists in the cluster - String instanceConfigPath = PropertyPathBuilder.instanceConfig(clusterName, instanceName); - throw new HelixException(String.format( - (_zkClient.exists(instanceConfigPath) ? ResetPartitionFailureReason.INSTANCE_NOT_ALIVE - : ResetPartitionFailureReason.INSTANCE_NON_EXISTENT) - .getMessage(resourceName, partitionNames, instanceName, instanceName, clusterName))); - } - - // check resource group exists - IdealState idealState = accessor.getProperty(keyBuilder.idealStates(resourceName)); - if (idealState == null) { - throw new HelixException(String.format(ResetPartitionFailureReason.RESOURCE_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, resourceName, clusterName))); - } - - // check partition exists in resource group - Set resetPartitionNames = new HashSet(partitionNames); - Set partitions = - (idealState.getRebalanceMode() == RebalanceMode.CUSTOMIZED) ? 
idealState.getRecord() - .getMapFields().keySet() : idealState.getRecord().getListFields().keySet(); - if (!partitions.containsAll(resetPartitionNames)) { - throw new HelixException(String.format(ResetPartitionFailureReason.PARTITION_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, partitionNames.toString(), - clusterName))); - } - - // check partition is in ERROR state - String sessionId = liveInstance.getEphemeralOwner(); - CurrentState curState = - accessor.getProperty(keyBuilder.currentState(instanceName, sessionId, resourceName)); - for (String partitionName : resetPartitionNames) { - if (!curState.getState(partitionName).equals(HelixDefinedState.ERROR.toString())) { - throw new HelixException(String.format(ResetPartitionFailureReason.PARTITION_NOT_ERROR - .getMessage(resourceName, partitionNames, instanceName, partitionNames.toString(), - clusterName))); - } - } - - // check stateModelDef exists and get initial state - String stateModelDef = idealState.getStateModelDefRef(); - StateModelDefinition stateModel = accessor.getProperty(keyBuilder.stateModelDef(stateModelDef)); - if (stateModel == null) { - throw new HelixException(String.format(ResetPartitionFailureReason.STATE_MODEL_NON_EXISTENT - .getMessage(resourceName, partitionNames, instanceName, stateModelDef, clusterName))); - } - - // check there is no pending messages for the partitions exist - List messages = accessor.getChildValues(keyBuilder.messages(instanceName), true); - for (Message message : messages) { - if (!MessageType.STATE_TRANSITION.name().equalsIgnoreCase(message.getMsgType()) || !sessionId - .equals(message.getTgtSessionId()) || !resourceName.equals(message.getResourceName()) - || !resetPartitionNames.contains(message.getPartitionName())) { - continue; - } - - throw new HelixException(String.format( - "Can't reset state for %s.%s on %s, because a pending message %s exists for resource %s", - resourceName, partitionNames, instanceName, message.toString(), - message.getResourceName())); - } - - String adminName = null; - try { - adminName = InetAddress.getLocalHost().getCanonicalHostName() + "-ADMIN"; - } catch (UnknownHostException e) { - // can ignore it - logger.info("Unable to get host name. 
Will set it to UNKNOWN, mostly ignorable", e); - adminName = "UNKNOWN"; - } - - List resetMessages = new ArrayList(); - List messageKeys = new ArrayList(); - for (String partitionName : resetPartitionNames) { - // send ERROR to initialState message - String msgId = UUID.randomUUID().toString(); - Message message = new Message(MessageType.STATE_TRANSITION, msgId); - message.setSrcName(adminName); - message.setTgtName(instanceName); - message.setMsgState(MessageState.NEW); - message.setPartitionName(partitionName); - message.setResourceName(resourceName); - message.setTgtSessionId(sessionId); - message.setStateModelDef(stateModelDef); - message.setFromState(HelixDefinedState.ERROR.toString()); - message.setToState(stateModel.getInitialState()); - message.setStateModelFactoryName(idealState.getStateModelFactoryName()); - - if (idealState.getResourceGroupName() != null) { - message.setResourceGroupName(idealState.getResourceGroupName()); - } - if (idealState.getInstanceGroupTag() != null) { - message.setResourceTag(idealState.getInstanceGroupTag()); - } - - resetMessages.add(message); - messageKeys.add(keyBuilder.message(instanceName, message.getId())); - } - - accessor.setChildren(messageKeys, resetMessages); + sendStateTransitionMessage(clusterName, instanceName, resourceName, partitionNames, StateTransitionType.RESET); } @Override @@ -1528,8 +1433,7 @@ public List getInstancesInClusterWithTag(String clusterName, String tag) List instances = _zkClient.getChildren(memberInstancesPath); List result = new ArrayList(); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); for (String instanceName : instances) { @@ -1675,8 +1579,7 @@ public List getResourcesInCluster(String clusterName) { public List getResourcesInClusterWithTag(String clusterName, String tag) { List resourcesWithTag = new ArrayList(); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); for (String resourceName : getResourcesInCluster(clusterName)) { @@ -1691,8 +1594,7 @@ public List getResourcesInClusterWithTag(String clusterName, String tag) @Override public IdealState getResourceIdealState(String clusterName, String resourceName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.idealStates(resourceName)); @@ -1704,8 +1606,7 @@ public void setResourceIdealState(String clusterName, String resourceName, logger .info("Set IdealState for resource {} in cluster {} with new IdealState {}.", resourceName, clusterName, idealState == null ? 
"NULL" : idealState.toString()); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.idealStates(resourceName), idealState); @@ -1747,8 +1648,7 @@ public void removeFromIdealState(String clusterName, String resourceName, IdealS @Override public ExternalView getResourceExternalView(String clusterName, String resourceName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.externalView(resourceName)); } @@ -1756,8 +1656,7 @@ public ExternalView getResourceExternalView(String clusterName, String resourceN @Override public CustomizedView getResourceCustomizedView(String clusterName, String resourceName, String customizedStateType) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.customizedView(customizedStateType, resourceName)); } @@ -1790,8 +1689,7 @@ public void addStateModelDef(String clusterName, String stateModelDef, } } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.stateModelDef(stateModelDef), stateModel); } @@ -1802,8 +1700,7 @@ public void dropResource(String clusterName, String resourceName) { if (!ZKUtil.isClusterSetup(clusterName, _zkClient)) { throw new HelixException("Cluster " + clusterName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.idealStates(resourceName)); @@ -1822,8 +1719,7 @@ public void addCloudConfig(String clusterName, CloudConfig cloudConfig) { CloudConfig.Builder builder = new CloudConfig.Builder(cloudConfig); CloudConfig cloudConfigBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.cloudConfig(), cloudConfigBuilder); } @@ -1831,8 +1727,7 @@ public void addCloudConfig(String clusterName, CloudConfig cloudConfig) { @Override public void removeCloudConfig(String clusterName) { logger.info("Remove Cloud Config for cluster {}.", clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.cloudConfig()); } @@ -1863,8 +1758,7 @@ public List getStateModelDefs(String clusterName) { @Override 
public StateModelDefinition getStateModelDef(String clusterName, String stateModelName) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); return accessor.getProperty(keyBuilder.stateModelDef(stateModelName)); @@ -1873,8 +1767,7 @@ public StateModelDefinition getStateModelDef(String clusterName, String stateMod @Override public void dropCluster(String clusterName) { logger.info("Deleting cluster {}.", clusterName); - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); String root = "/" + clusterName; @@ -1951,8 +1844,7 @@ public void addCustomizedStateConfig(String clusterName, new CustomizedStateConfig.Builder(customizedStateConfig); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder); @@ -1963,8 +1855,7 @@ public void removeCustomizedStateConfig(String clusterName) { logger.info( "Remove CustomizedStateConfig from cluster {}.", clusterName); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.removeProperty(keyBuilder.customizedStateConfig()); @@ -1983,8 +1874,7 @@ public void addTypeToCustomizedStateConfig(String clusterName, String type) { builder.addAggregationEnabledType(type); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); if(!accessor.updateProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder)) { @@ -2013,8 +1903,7 @@ public void removeTypeFromCustomizedStateConfig(String clusterName, String type) builder.removeAggregationEnabledType(type); CustomizedStateConfig customizedStateConfigFromBuilder = builder.build(); - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); accessor.setProperty(keyBuilder.customizedStateConfig(), customizedStateConfigFromBuilder); @@ -2037,7 +1926,7 @@ public void rebalance(String clusterName, String resourceName, int replica) { @Override public void onDemandRebalance(String clusterName) { - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; String path = PropertyPathBuilder.clusterConfig(clusterName); if (!baseAccessor.exists(path, 0)) { @@ -2220,7 +2109,7 @@ public void setConstraint(String clusterName, final ConstraintType 
constraintTyp final String constraintId, final ConstraintItem constraintItem) { logger.info("Set constraint type {} with constraint id {} for cluster {}.", constraintType, constraintId, clusterName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); String path = keyBuilder.constraint(constraintType.toString()).getPath(); @@ -2243,7 +2132,7 @@ public void removeConstraint(String clusterName, final ConstraintType constraint final String constraintId) { logger.info("Remove constraint type {} with constraint id {} for cluster {}.", constraintType, constraintId, clusterName); - BaseDataAccessor baseAccessor = new ZkBaseDataAccessor(_zkClient); + BaseDataAccessor baseAccessor = _baseDataAccessor; PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); String path = keyBuilder.constraint(constraintType.toString()).getPath(); @@ -2264,8 +2153,7 @@ public ZNRecord update(ZNRecord currentData) { @Override public ClusterConstraints getConstraints(String clusterName, ConstraintType constraintType) { - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = new PropertyKey.Builder(clusterName); return accessor.getProperty(keyBuilder.constraint(constraintType.toString())); @@ -2347,8 +2235,7 @@ public void addInstanceTag(String clusterName, String instanceName, String tag) throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2368,8 +2255,7 @@ public void removeInstanceTag(String clusterName, String instanceName, String ta throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - ZKHelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + ZKHelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2389,8 +2275,7 @@ public void setInstanceZoneId(String clusterName, String instanceName, String zo throw new HelixException( "cluster " + clusterName + " instance " + instanceName + " is not setup yet"); } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); InstanceConfig config = accessor.getProperty(keyBuilder.instanceConfig(instanceName)); @@ -2427,6 +2312,7 @@ public void enableBatchMessageMode(String clusterName, String resourceName, bool setResourceIdealState(clusterName, resourceName, idealState); } + @Deprecated private void enableSingleInstance(final String clusterName, final String instanceName, final boolean enabled, BaseDataAccessor baseAccessor, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -2446,23 
+2332,19 @@ public ZNRecord update(ZNRecord currentData) { } InstanceConfig config = new InstanceConfig(currentData); - config.setInstanceEnabled(enabled); - if (!enabled) { - // new disabled type and reason will over write existing ones. - config.resetInstanceDisabledTypeAndReason(); - if (reason != null) { - config.setInstanceDisabledReason(reason); - } - if (disabledType != null) { - config.setInstanceDisabledType(disabledType); - } - } + config.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE).setReason(reason).setSource( + disabledType != null + ? InstanceConstants.InstanceOperationSource.instanceDisabledTypeToInstanceOperationSource( + disabledType) : null).build()); return config.getRecord(); } }, AccessOption.PERSISTENT); } // TODO: Add history ZNode for all batched enabling/disabling histories with metadata. + @Deprecated private void enableBatchInstances(final String clusterName, final List instances, final boolean enabled, BaseDataAccessor baseAccessor, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -2798,8 +2680,7 @@ private Set findTimeoutOfflineInstances(String clusterName, long offline } } - HelixDataAccessor accessor = - new ZKHelixDataAccessor(clusterName, new ZkBaseDataAccessor(_zkClient)); + HelixDataAccessor accessor = new ZKHelixDataAccessor(clusterName, _baseDataAccessor); PropertyKey.Builder keyBuilder = accessor.keyBuilder(); List instanceConfigNames = accessor.getChildNames(keyBuilder.instanceConfigs()); List instancePathNames = accessor.getChildNames(keyBuilder.instances()); diff --git a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java index 0d67ced4b3..0a91370b07 100644 --- a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java +++ b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixStateTransitionHandler.java @@ -176,7 +176,8 @@ void postHandleMessage() { deltaList.add(delta); _currentStateDelta.setDeltaList(deltaList); _stateModelFactory.removeStateModel(_message.getResourceName(), partitionKey); - } else if (_stateModel.getCurrentState().equals(_message.getFromState())) { + } else if (_message.getFromState().equals("*") + || _stateModel.getCurrentState().equals(_message.getFromState())) { // if the partition is not to be dropped, update _stateModel to the TO_STATE // need this check because TaskRunner may change _stateModel before reach here. 
_stateModel.updateState(toState); diff --git a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java index 4470b99964..6a9473ebaa 100644 --- a/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java +++ b/helix-core/src/main/java/org/apache/helix/messaging/handling/HelixTask.java @@ -323,6 +323,7 @@ private void reportMessageStat(HelixManager manager, Message message, HelixTaskR String fromState = message.getFromState(); String toState = message.getToState(); String transition = fromState + "--" + toState; + transition = transition.replaceAll("\\*", "ANY"); StateTransitionContext cxt = new StateTransitionContext(manager.getClusterName(), manager.getInstanceName(), diff --git a/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java b/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java index fdc05ac9bb..edb7a76c6d 100644 --- a/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/ClusterConfig.java @@ -75,6 +75,8 @@ public enum ClusterConfigProperty { // TODO: if we want to support this for other rebalancers, we need to implement that logic GLOBAL_MAX_PARTITIONS_ALLOWED_PER_INSTANCE, // The following two include offline AND disabled instances + // TODO: At some point we should rename this to something like MAX_INSTANCES_UNABLE_TO_ACCEPT_REPLICAS + // to make it clear that it includes both offline and non-assignable instances MAX_OFFLINE_INSTANCES_ALLOWED, NUM_OFFLINE_INSTANCES_FOR_AUTO_EXIT, // For auto-exiting maintenance mode @@ -88,7 +90,9 @@ public enum ClusterConfigProperty { // state transition if the number of // partitons that need recovery or in // error exceeds this limitation + @Deprecated // TODO: Remove in Helix 2.0 DISABLED_INSTANCES, + @Deprecated // TODO: Remove in Helix 2.0 DISABLED_INSTANCES_WITH_INFO, // disabled instances and disabled instances with info are for storing batch disabled instances. // disabled instances will write into both 2 fields for backward compatibility. 
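The hunks above retire the cluster-level DISABLED_INSTANCES bookkeeping in favor of the per-instance operation introduced in this change. As a rough usage sketch of the replacement admin call path (assuming a reachable ZooKeeper at localhost:2181 and illustrative cluster/instance names, none of which come from this patch):

// Disable and re-enable an instance through HelixAdmin.setInstanceOperation(...) instead of the
// deprecated enableInstance(...) / ClusterConfig DISABLED_INSTANCES path.
HelixAdmin admin = new ZKHelixAdmin("localhost:2181"); // assumed ZK endpoint
admin.setInstanceOperation("TestCluster", "localhost_12918",
    InstanceConstants.InstanceOperation.DISABLE);
admin.setInstanceOperation("TestCluster", "localhost_12918",
    InstanceConstants.InstanceOperation.ENABLE);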
@@ -816,8 +820,11 @@ public void setDisabledInstancesWithInfo(Map disabledInstancesWi /** * Get current disabled instance map of + * @deprecated We will no longer be using the clusterConfig to disable instances + * please use the InstanceConfig to disable instances * @return a non-null map of disabled instances in cluster config */ + @Deprecated public Map getDisabledInstances() { Map disabledInstances = _record.getMapField(ClusterConfigProperty.DISABLED_INSTANCES.name()); @@ -827,8 +834,10 @@ public Map getDisabledInstances() { /** * Get current disabled instance map of * + * @deprecated Please use InstanceConfig for enabling and disabling instances * @return a non-null map of disabled instances in cluster config */ + @Deprecated public Map getDisabledInstancesWithInfo() { Map disabledInstances = _record.getMapField(ClusterConfigProperty.DISABLED_INSTANCES_WITH_INFO.name()); diff --git a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java index 2f3da14569..1b3acd68d6 100644 --- a/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java +++ b/helix-core/src/main/java/org/apache/helix/model/InstanceConfig.java @@ -29,6 +29,12 @@ import java.util.Set; import java.util.stream.Collectors; +import javax.annotation.Nullable; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.ImmutableSet; import org.apache.helix.HelixException; import org.apache.helix.HelixProperty; import org.apache.helix.constants.InstanceConstants; @@ -50,10 +56,10 @@ public enum InstanceConfigProperty { HELIX_HOST, HELIX_PORT, HELIX_ZONE_ID, - HELIX_ENABLED, + @Deprecated HELIX_ENABLED, HELIX_ENABLED_TIMESTAMP, - HELIX_DISABLED_REASON, - HELIX_DISABLED_TYPE, + @Deprecated HELIX_DISABLED_REASON, + @Deprecated HELIX_DISABLED_TYPE, HELIX_DISABLED_PARTITION, TAG_LIST, INSTANCE_WEIGHT, @@ -61,18 +67,140 @@ public enum InstanceConfigProperty { DELAY_REBALANCE_ENABLED, MAX_CONCURRENT_TASK, INSTANCE_INFO_MAP, - INSTANCE_CAPACITY_MAP, - TARGET_TASK_THREAD_POOL_SIZE, - INSTANCE_OPERATION + INSTANCE_CAPACITY_MAP, TARGET_TASK_THREAD_POOL_SIZE, HELIX_INSTANCE_OPERATIONS + } + + public static class InstanceOperation { + private final Map _properties; + + private enum InstanceOperationProperties { + OPERATION, REASON, SOURCE, TIMESTAMP + } + + private InstanceOperation(@Nullable Map properties) { + // Default to ENABLE operation if no operation type is provided. + _properties = properties == null ? new HashMap<>() : properties; + if (!_properties.containsKey(InstanceOperationProperties.OPERATION.name())) { + _properties.put(InstanceOperationProperties.OPERATION.name(), + InstanceConstants.InstanceOperation.ENABLE.name()); + } + } + + public static class Builder { + private Map _properties = new HashMap<>(); + + /** + * Set the operation type for this instance operation. + * @param operationType InstanceOperation type of this instance operation. + */ + public Builder setOperation(@Nullable InstanceConstants.InstanceOperation operationType) { + _properties.put(InstanceOperationProperties.OPERATION.name(), + operationType == null ? InstanceConstants.InstanceOperation.ENABLE.name() + : operationType.name()); + return this; + } + + /** + * Set the reason for this instance operation. 
+ * @param reason + */ + public Builder setReason(String reason) { + _properties.put(InstanceOperationProperties.REASON.name(), reason != null ? reason : ""); + return this; + } + + /** + * Set the source for this instance operation. + * @param source InstanceOperationSource + * that caused this instance operation to be triggered. + */ + public Builder setSource(InstanceConstants.InstanceOperationSource source) { + _properties.put(InstanceOperationProperties.SOURCE.name(), + source == null ? InstanceConstants.InstanceOperationSource.USER.name() + : source.name()); + return this; + } + + public InstanceOperation build() throws IllegalArgumentException { + if (!_properties.containsKey(InstanceOperationProperties.OPERATION.name())) { + throw new IllegalArgumentException( + "Instance operation type is not set, this is a required field."); + } + _properties.put(InstanceOperationProperties.TIMESTAMP.name(), + String.valueOf(System.currentTimeMillis())); + return new InstanceOperation(_properties); + } + } + + /** + * Get the operation type of this instance operation. + * @return the InstanceOperation type + */ + public InstanceConstants.InstanceOperation getOperation() throws IllegalArgumentException { + return InstanceConstants.InstanceOperation.valueOf( + _properties.get(InstanceOperationProperties.OPERATION.name())); + } + + /** + * Get the reason for this instance operation. + * If the reason is not set, it will default to an empty string. + * + * @return the reason for this instance operation. + */ + public String getReason() { + return _properties.getOrDefault(InstanceOperationProperties.REASON.name(), ""); + } + + /** + * Get the InstanceOperationSource + * that caused this instance operation to be triggered. + * If the source is not set, it will default to DEFAULT. + * + * @return the InstanceOperationSource + *that caused this instance operation to be triggered. + */ + public InstanceConstants.InstanceOperationSource getSource() { + return InstanceConstants.InstanceOperationSource.valueOf( + _properties.getOrDefault(InstanceOperationProperties.SOURCE.name(), + InstanceConstants.InstanceOperationSource.USER.name())); + } + + /** + * Get the timestamp (milliseconds from epoch) when this instance operation was triggered. + * + * @return the timestamp when the instance operation was triggered. + */ + public long getTimestamp() { + return Long.parseLong(_properties.get(InstanceOperationProperties.TIMESTAMP.name())); + } + + private void setTimestamp(long timestamp) { + _properties.put(InstanceOperationProperties.TIMESTAMP.name(), String.valueOf(timestamp)); + } + + private Map getProperties() { + return _properties; + } } public static final int WEIGHT_NOT_SET = -1; public static final int MAX_CONCURRENT_TASK_NOT_SET = -1; private static final int TARGET_TASK_THREAD_POOL_SIZE_NOT_SET = -1; private static final boolean HELIX_ENABLED_DEFAULT_VALUE = true; + private static final long HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE = -1; + private static final ObjectMapper _objectMapper = new ObjectMapper(); + + // These fields are not allowed to be overwritten by the merge method because + // they are unique properties of an instance. 
+ private static final ImmutableSet NON_OVERWRITABLE_PROPERTIES = + ImmutableSet.of(InstanceConfigProperty.HELIX_HOST, InstanceConfigProperty.HELIX_PORT, + InstanceConfigProperty.HELIX_ZONE_ID, InstanceConfigProperty.DOMAIN, + InstanceConfigProperty.INSTANCE_INFO_MAP); private static final Logger _logger = LoggerFactory.getLogger(InstanceConfig.class.getName()); + private List _deserializedInstanceOperations; + /** * Instantiate for a specific instance * @param instanceId the instance identifier @@ -252,29 +380,34 @@ public boolean containsTag(String tag) { } /** - * Check if this instance is enabled and able to serve replicas - * @return true if enabled, false if disabled + * Get the timestamp (milliseconds from epoch) when this instance was enabled/disabled last time. + * + * @return the timestamp when the instance was enabled/disabled last time. If the instance is never + * enabled/disabled, return -1. */ - public boolean getInstanceEnabled() { - return _record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), - HELIX_ENABLED_DEFAULT_VALUE); + public long getInstanceEnabledTime() { + return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), + HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); } /** - * Set the enabled state of the instance - * If user enables the instance, HELIX_DISABLED_REASON filed will be removed. - * + * Set the enabled state of the instance. If user enables the instance, the HELIX_DISABLED_REASON field + * will be removed. * @param enabled true to enable, false to disable + * @deprecated This method is deprecated. Please use setInstanceOperation instead. */ + @Deprecated public void setInstanceEnabled(boolean enabled) { // set instance operation only when we need to change InstanceEnabled value. - setInstanceEnabledHelper(enabled); + setInstanceEnabledHelper(enabled, null); } - private void setInstanceEnabledHelper(boolean enabled) { - _record.setBooleanField(InstanceConfigProperty.HELIX_ENABLED.toString(), enabled); - _record.setLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), System.currentTimeMillis()); + private void setInstanceEnabledHelper(boolean enabled, Long timestampOverride) { + _record.setBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), enabled); + _record.setLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), + timestampOverride != null ? timestampOverride : System.currentTimeMillis()); if (enabled) { + // TODO: Replace this when HELIX_ENABLED and HELIX_DISABLED_REASON are removed. resetInstanceDisabledTypeAndReason(); } } @@ -282,6 +415,7 @@ private void setInstanceEnabledHelper(boolean enabled) { /** * Removes HELIX_DISABLED_REASON and HELIX_DISABLED_TYPE entry from simple field. */ + @Deprecated public void resetInstanceDisabledTypeAndReason() { _record.getSimpleFields().remove(InstanceConfigProperty.HELIX_DISABLED_REASON.name()); _record.getSimpleFields().remove(InstanceConfigProperty.HELIX_DISABLED_TYPE.name()); @@ -290,27 +424,36 @@ public void resetInstanceDisabledTypeAndReason() { /** * Set the instance disabled reason when instance is disabled. * It will be a no-op when instance is enabled. + * @deprecated This method is deprecated. Please use setInstanceOperation along with InstanceOperation.Builder().setReason(...) instead. 
*/ + @Deprecated public void setInstanceDisabledReason(String disabledReason) { - if (!getInstanceEnabled()) { - _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), disabledReason); - } + if (getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), disabledReason); + } } /** * Set the instance disabled type when instance is disabled. * It will be a no-op when instance is enabled. + * @deprecated This method is deprecated. Please use setInstanceOperation along with + * InstanceOperation.Builder().setSource + *(...) */ + @Deprecated public void setInstanceDisabledType(InstanceConstants.InstanceDisabledType disabledType) { - if (!getInstanceEnabled()) { + if (getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.DISABLE)) { _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), disabledType.name()); } } /** + * Get the instance disabled reason when instance is disabled. * @return Return instance disabled reason. Default is am empty string. + * @deprecated This method is deprecated. Please use getInstanceOperation().getReason() instead. */ + @Deprecated public String getInstanceDisabledReason() { return _record.getStringField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), ""); } @@ -319,31 +462,222 @@ public String getInstanceDisabledReason() { * * @return Return instance disabled type (org.apache.helix.constants.InstanceConstants.InstanceDisabledType) * Default is am empty string. + * @deprecated This method is deprecated. Please use getInstanceOperation().getSource + *() instead. */ + @Deprecated public String getInstanceDisabledType() { - if (getInstanceEnabled()) { + if (_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), + HELIX_ENABLED_DEFAULT_VALUE)) { return InstanceConstants.INSTANCE_NOT_DISABLED; } return _record.getStringField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); } + private List getInstanceOperations() { + if (_deserializedInstanceOperations == null || _deserializedInstanceOperations.isEmpty()) { + // If the _deserializedInstanceOperations is not set, then we need to build it from the real + // helix property HELIX_INSTANCE_OPERATIONS. + List instanceOperations = + _record.getListField(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name()); + List newDeserializedInstanceOperations = new ArrayList<>(); + if (instanceOperations != null) { + for (String serializedInstanceOperation : instanceOperations) { + try { + Map properties = _objectMapper.readValue(serializedInstanceOperation, + new TypeReference>() { + }); + newDeserializedInstanceOperations.add(new InstanceOperation(properties)); + } catch (JsonProcessingException e) { + _logger.error( + "Failed to deserialize instance operation for instance: " + _record.getId(), e); + } + } + } + _deserializedInstanceOperations = newDeserializedInstanceOperations; + } + + return _deserializedInstanceOperations; + } + /** - * Get the timestamp (milliseconds from epoch) when this instance was enabled/disabled last time. + * Set the instance operation for this instance. + * This method also sets the HELIX_ENABLED, HELIX_DISABLED_REASON, and HELIX_DISABLED_TYPE fields + * for backwards compatibility. 
* - * @return + * @param operation the instance operation */ - public long getInstanceEnabledTime() { - return _record.getLongField(InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name(), -1); + public void setInstanceOperation(InstanceOperation operation) { + List deserializedInstanceOperations = getInstanceOperations(); + + if (operation.getSource() == InstanceConstants.InstanceOperationSource.ADMIN) { + deserializedInstanceOperations.clear(); + } else { + // Remove the instance operation with the same source if it exists. + deserializedInstanceOperations.removeIf( + instanceOperation -> instanceOperation.getSource() == operation.getSource()); + } + if (operation.getOperation() == InstanceConstants.InstanceOperation.ENABLE) { + // Insert the operation after the last ENABLE or at the beginning if there isn't ENABLE in the list. + int insertIndex = 0; + for (int i = deserializedInstanceOperations.size() - 1; i >= 0; i--) { + if (deserializedInstanceOperations.get(i).getOperation() + == InstanceConstants.InstanceOperation.ENABLE) { + insertIndex = i + 1; + break; + } + } + deserializedInstanceOperations.add(insertIndex, operation); + } else { + deserializedInstanceOperations.add(operation); + } + // Set the actual field in the ZnRecord + _record.setListField(InstanceConfigProperty.HELIX_INSTANCE_OPERATIONS.name(), + deserializedInstanceOperations.stream().map(instanceOperation -> { + try { + return _objectMapper.writeValueAsString(instanceOperation.getProperties()); + } catch (JsonProcessingException e) { + throw new HelixException( + "Failed to serialize instance operation for instance: " + _record.getId() + + " Can't set the instance operation to: " + operation.getOperation(), e); + } + }).collect(Collectors.toList())); + + // TODO: Remove this when we are sure that all users are using the new InstanceOperation only and HELIX_ENABLED is removed. + if (operation.getOperation() == InstanceConstants.InstanceOperation.DISABLE) { + // We are still setting the HELIX_ENABLED field for backwards compatibility. + // It is possible that users will be using earlier version of HelixAdmin or helix-rest + // is on older version. + + if (_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), true)) { + // Check if it is already disabled, if yes, then we don't need to set HELIX_ENABLED and HELIX_ENABLED_TIMESTAMP + setInstanceEnabledHelper(false, operation.getTimestamp()); + } + + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), + operation.getReason()); + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), + InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + } else if (operation.getOperation() == InstanceConstants.InstanceOperation.ENABLE) { + // If any of the other InstanceOperations are of type DISABLE, set that in the HELIX_ENABLED, + // HELIX_DISABLED_REASON, and HELIX_DISABLED_TYPE fields. 
+ InstanceOperation latestDisableInstanceOperation = null; + for (InstanceOperation instanceOperation : getInstanceOperations()) { + if (instanceOperation.getOperation() == InstanceConstants.InstanceOperation.DISABLE && ( + latestDisableInstanceOperation == null || instanceOperation.getTimestamp() + > latestDisableInstanceOperation.getTimestamp())) { + latestDisableInstanceOperation = instanceOperation; + } + } + + if (latestDisableInstanceOperation != null) { + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_REASON.name(), + latestDisableInstanceOperation.getReason()); + _record.setSimpleField(InstanceConfigProperty.HELIX_DISABLED_TYPE.name(), + InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + } else { + setInstanceEnabledHelper(true, operation.getTimestamp()); + } + } } + /** + * Set the instance operation for this instance. Provide the InstanceOperation enum and the reason + * and source will be set to default values. + * + * @param operation the instance operation + */ public void setInstanceOperation(InstanceConstants.InstanceOperation operation) { - _record.setSimpleField(InstanceConfigProperty.INSTANCE_OPERATION.name(), - operation == null ? "" : operation.name()); + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(operation).build(); + setInstanceOperation(instanceOperation); + } + + private void setInstanceOperationInit(InstanceConstants.InstanceOperation operation) { + if (operation == null) { + return; + } + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(operation).setReason("INIT").build(); + // When an instance is created for the first time the timestamp is set to -1 so that if it + // is disabled it will not be considered within the delay window when it joins. + instanceOperation.setTimestamp(HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); + setInstanceOperation(instanceOperation); } - public String getInstanceOperation() { - return _record.getStringField(InstanceConfigProperty.INSTANCE_OPERATION.name(), ""); + private InstanceOperation getActiveInstanceOperation() { + List instanceOperations = getInstanceOperations(); + + if (instanceOperations.isEmpty()) { + InstanceOperation instanceOperation = + new InstanceOperation.Builder().setOperation(InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.DEFAULT).build(); + instanceOperation.setTimestamp(HELIX_ENABLED_TIMESTAMP_DEFAULT_VALUE); + return instanceOperation; + } + + // The last instance operation in the list is the most recent one. + // ENABLE operation should not be included in the list. + return instanceOperations.get(instanceOperations.size() - 1); + } + + /** + * Get the InstanceOperationType of this instance, default is ENABLE if nothing is set. If + * HELIX_ENABLED is set to false, then the instance operation is DISABLE for backwards + * compatibility. + * + * @return the instance operation + */ + public InstanceOperation getInstanceOperation() { + InstanceOperation activeInstanceOperation = getActiveInstanceOperation(); + try { + activeInstanceOperation.getOperation(); + } catch (IllegalArgumentException e) { + _logger.error("Invalid instance operation type for instance: " + _record.getId() + + ". You may need to update your version of Helix to get support for this " + + "type of InstanceOperation. 
Defaulting to UNKNOWN."); + activeInstanceOperation = + new InstanceOperation.Builder().setOperation(InstanceConstants.InstanceOperation.UNKNOWN) + .build(); + } + + // Always respect the HELIX_ENABLED being set to false when instance operation is unset + // for backwards compatibility. + if (!_record.getBooleanField(InstanceConfigProperty.HELIX_ENABLED.name(), + HELIX_ENABLED_DEFAULT_VALUE) + && (InstanceConstants.INSTANCE_DISABLED_OVERRIDABLE_OPERATIONS.contains( + activeInstanceOperation.getOperation()))) { + return new InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason(getInstanceDisabledReason()) + .setSource( + InstanceConstants.InstanceOperationSource.instanceDisabledTypeToInstanceOperationSource( + InstanceConstants.InstanceDisabledType.valueOf(getInstanceDisabledType()))) + .build(); + } + + return activeInstanceOperation; + } + + /** + * Check if this instance is enabled. This is used to determine if the instance can host online + * replicas and take new assignment. + * + * @return true if enabled, false otherwise + */ + public boolean getInstanceEnabled() { + return getInstanceOperation().getOperation().equals(InstanceConstants.InstanceOperation.ENABLE); + } + + /** + * Check to see if the instance is assignable. This is used to determine if the instance can be + * selected by the rebalancer to take assignment of replicas. + * + * @return true if the instance is assignable, false otherwise + */ + public boolean isAssignable() { + return InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains( + getInstanceOperation().getOperation()); } /** @@ -777,6 +1111,34 @@ public boolean validateTopologySettingInInstanceConfig(ClusterConfig clusterConf return true; } + /** + * Overwrite the InstanceConfigProperties from the given InstanceConfig to this InstanceConfig. + * The merge is done by overwriting the properties in this InstanceConfig with the properties + * from the given InstanceConfig. {@link #NON_OVERWRITABLE_PROPERTIES} will not be overridden. 
+ * + * @param overwritingInstanceConfig the InstanceConfig to override into this InstanceConfig + */ + public void overwriteInstanceConfig(InstanceConfig overwritingInstanceConfig) { + // Remove all overwritable fields from the record + Set overwritableProperties = Arrays.stream(InstanceConfigProperty.values()) + .filter(property -> !NON_OVERWRITABLE_PROPERTIES.contains(property)).map(Enum::name) + .collect(Collectors.toSet()); + _record.getSimpleFields().keySet().removeAll(overwritableProperties); + _record.getListFields().keySet().removeAll(overwritableProperties); + _record.getMapFields().keySet().removeAll(overwritableProperties); + + // Get all overwritable fields from the overwritingInstanceConfig and set them in this record + overwritingInstanceConfig.getRecord().getSimpleFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setSimpleField(entry.getKey(), entry.getValue())); + overwritingInstanceConfig.getRecord().getListFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setListField(entry.getKey(), entry.getValue())); + overwritingInstanceConfig.getRecord().getMapFields().entrySet().stream() + .filter(entry -> overwritableProperties.contains(entry.getKey())) + .forEach((entry) -> _record.setMapField(entry.getKey(), entry.getValue())); + } + public static class Builder { private String _hostName; private String _port; @@ -828,12 +1190,13 @@ public InstanceConfig build(String instanceId) { instanceConfig.addTag(tag); } - if (_instanceEnabled != HELIX_ENABLED_DEFAULT_VALUE) { - instanceConfig.setInstanceEnabled(_instanceEnabled); + if (_instanceOperation == null && !_instanceEnabled) { + instanceConfig.setInstanceOperationInit(InstanceConstants.InstanceOperation.DISABLE); } - if (_instanceOperation != null) { - instanceConfig.setInstanceOperation(_instanceOperation); + if (_instanceOperation != null && !_instanceOperation.equals( + InstanceConstants.InstanceOperation.ENABLE)) { + instanceConfig.setInstanceOperationInit(_instanceOperation); } if (_instanceInfoMap != null) { @@ -899,9 +1262,11 @@ public Builder addTag(String tag) { /** * Set the enabled status for this instance + * @deprecated HELIX_ENABLED is no longer in use. Use setInstanceOperation instead. * @param instanceEnabled true if enabled, false otherwise * @return InstanceConfig.Builder */ + @Deprecated public Builder setInstanceEnabled(boolean instanceEnabled) { _instanceEnabled = instanceEnabled; return this; diff --git a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java index f978130b7b..83e0e1c604 100644 --- a/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java +++ b/helix-core/src/main/java/org/apache/helix/model/MaintenanceSignal.java @@ -50,7 +50,9 @@ public enum TriggeringEntity { * maintenance mode. This field does not apply when triggered manually. 
*/ public enum AutoTriggerReason { + @Deprecated // Replaced with MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS MAX_OFFLINE_INSTANCES_EXCEEDED, + MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS, MAX_PARTITION_PER_INSTANCE_EXCEEDED, NOT_APPLICABLE // Not triggered automatically or automatically exiting maintenance mode } diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java index 5064c64812..012513ffcd 100644 --- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java +++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java @@ -272,17 +272,9 @@ public void updateResourceState(ExternalView externalView, IdealState idealState long numOfPartitionWithTopState = 0; Set partitions = idealState.getPartitionSet(); - int replica; - try { - replica = Integer.valueOf(idealState.getReplicas()); - } catch (NumberFormatException e) { - _logger.info("Unspecified replica count for {}, skip updating the ResourceMonitor Mbean: {}", _resourceName, - idealState.getReplicas()); - return; - } catch (Exception ex) { - _logger.warn("Failed to get replica count for {}, cannot update the ResourceMonitor Mbean.", _resourceName); - return; - } + + // returns -1 when replica is set to ANY_LIVEINSTANCE. + int replica = idealState.getReplicaCount(-1); int minActiveReplica = idealState.getMinActiveReplicas(); minActiveReplica = (minActiveReplica >= 0) ? minActiveReplica : replica; diff --git a/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java b/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java index 143c14adef..5bb2a19c86 100644 --- a/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java +++ b/helix-core/src/main/java/org/apache/helix/participant/statemachine/StateModel.java @@ -115,4 +115,15 @@ public void cancel() { public boolean isCancelled() { return _cancelled; } + + /** + * Default transition to set a partition in any state to the ERROR state. + * @param message + * @param context + * @throws Exception + */ + @Transition(to = "ERROR", from = "*") + public void onBecomeErrorFromAny(Message message, NotificationContext context) throws Exception { + logger.info("Default *->ERROR transition invoked."); + } } diff --git a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java index 8872e9edac..f634aac46e 100644 --- a/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java +++ b/helix-core/src/main/java/org/apache/helix/spectator/RoutingDataCache.java @@ -26,7 +26,6 @@ import java.util.stream.Collectors; import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; import org.apache.helix.HelixConstants; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; @@ -45,16 +44,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; + /** * Cache the cluster data that are needed by RoutingTableProvider. */ class RoutingDataCache extends BasicClusterDataCache { private static Logger LOG = LoggerFactory.getLogger(RoutingDataCache.class.getName()); - // When an instance has any of these instance operations, it should not be routable. 
- private static final ImmutableSet NON_ROUTABLE_INSTANCE_OPERATIONS = - ImmutableSet.of(InstanceConstants.InstanceOperation.SWAP_IN.name()); - private final Map> _sourceDataTypeMap; private CurrentStateCache _currentStateCache; @@ -185,8 +181,8 @@ public synchronized void refresh(HelixDataAccessor accessor) { private void updateRoutableInstanceConfigMap(Map instanceConfigMap) { _routableInstanceConfigMap = instanceConfigMap.entrySet().stream().filter( - (instanceConfigEntry) -> !NON_ROUTABLE_INSTANCE_OPERATIONS.contains( - instanceConfigEntry.getValue().getInstanceOperation())) + (instanceConfigEntry) -> !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceConfigEntry.getValue().getInstanceOperation().getOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } @@ -194,8 +190,9 @@ private void updateRoutableLiveInstanceMap(Map instanceC Map liveInstanceMap) { _routableLiveInstanceMap = liveInstanceMap.entrySet().stream().filter( (liveInstanceEntry) -> instanceConfigMap.containsKey(liveInstanceEntry.getKey()) - && !NON_ROUTABLE_INSTANCE_OPERATIONS.contains( - instanceConfigMap.get(liveInstanceEntry.getKey()).getInstanceOperation())) + && !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains( + instanceConfigMap.get(liveInstanceEntry.getKey()).getInstanceOperation() + .getOperation())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } diff --git a/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java b/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java index fd22a8e1fd..6d4c687fcc 100644 --- a/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java +++ b/helix-core/src/main/java/org/apache/helix/task/JobDispatcher.java @@ -141,8 +141,8 @@ public ResourceAssignment processJobStatusUpdateAndAssignment(String jobName, // Will contain the list of partitions that must be explicitly dropped from the ideal state that // is stored in zk. Set liveInstances = - jobCfg.getInstanceGroupTag() == null ? _dataProvider.getAssignableEnabledLiveInstances() - : _dataProvider.getAssignableEnabledLiveInstancesWithTag(jobCfg.getInstanceGroupTag()); + jobCfg.getInstanceGroupTag() == null ? 
_dataProvider.getEnabledLiveInstances() + : _dataProvider.getEnabledLiveInstancesWithTag(jobCfg.getInstanceGroupTag()); if (liveInstances.isEmpty()) { LOG.error("No available instance found for job: {}", jobName); diff --git a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java index 0d9a4f1ac2..a9578632b0 100644 --- a/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java +++ b/helix-core/src/main/java/org/apache/helix/tools/ClusterSetup.java @@ -134,6 +134,9 @@ public class ClusterSetup { public static final String resetInstance = "resetInstance"; public static final String resetResource = "resetResource"; + // set partitions to ERROR + public static final String setPartitionsToError = "setPartitionsToError"; + // help public static final String help = "help"; @@ -313,7 +316,7 @@ public void dropInstanceFromCluster(String clusterName, String instanceId) { ClusterConfig clusterConfig = accessor.getProperty(keyBuilder.clusterConfig()); // ensure node is disabled, otherwise fail - if (InstanceValidationUtil.isInstanceEnabled(config, clusterConfig)) { + if (config.getInstanceEnabled()) { String error = "Node " + instanceId + " is enabled, cannot drop"; _logger.warn(error); throw new HelixException(error); @@ -1114,6 +1117,13 @@ private static Options constructCommandLineOptions() { removeCloudConfigOption.setRequired(false); removeCloudConfigOption.setArgName("clusterName"); + Option setPartitionsToErrorOption = + OptionBuilder.withLongOpt(setPartitionsToError) + .withDescription("Set a Partition to Error State").create(); + setPartitionsToErrorOption.setArgs(4); + setPartitionsToErrorOption.setRequired(false); + setPartitionsToErrorOption.setArgName("clusterName instanceName resourceName partitionName"); + OptionGroup group = new OptionGroup(); group.setRequired(true); group.addOption(rebalanceOption); @@ -1153,6 +1163,7 @@ private static Options constructCommandLineOptions() { group.addOption(listStateModelOption); group.addOption(addResourcePropertyOption); group.addOption(removeResourcePropertyOption); + group.addOption(setPartitionsToErrorOption); // set/get/remove config options group.addOption(setConfOption); @@ -1561,6 +1572,16 @@ public static int processCommandLineArgs(String[] cliArgs) throws Exception { String newInstanceName = cmd.getOptionValues(swapInstance)[2]; setupTool.swapInstance(clusterName, oldInstanceName, newInstanceName); + } else if (cmd.hasOption(setPartitionsToError)) { + String[] args = cmd.getOptionValues(setPartitionsToError); + + String clusterName = args[0]; + String instanceName = args[1]; + String resourceName = args[2]; + List partitionNames = Arrays.asList(Arrays.copyOfRange(args, 3, args.length)); + + setupTool.getClusterManagementTool().setPartitionsToError(clusterName, instanceName, resourceName, partitionNames); + return 0; } // set/get/remove config options else if (cmd.hasOption(setConfig)) { diff --git a/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java b/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java index d0da9ba8eb..7ed44f825c 100644 --- a/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java +++ b/helix-core/src/main/java/org/apache/helix/tools/ClusterVerifiers/StrictMatchExternalViewVerifier.java @@ -331,7 +331,7 @@ private Map> computeIdealPartitionState( for (String partition : 
idealState.getPartitionSet()) { List preferenceList = AbstractRebalancer.getPreferenceList(new Partition(partition), - idealState, cache.getAssignableEnabledLiveInstances()); + idealState, cache.getEnabledLiveInstances()); Map idealMapping; if (_isDeactivatedNodeAware) { idealMapping = HelixUtil.computeIdealMapping(preferenceList, stateModelDef, @@ -339,7 +339,7 @@ private Map> computeIdealPartitionState( cache.getDisabledInstancesForPartition(idealState.getResourceName(), partition)); } else { idealMapping = HelixUtil.computeIdealMapping(preferenceList, stateModelDef, - cache.getAssignableEnabledLiveInstances(), + cache.getEnabledLiveInstances(), Collections.emptySet()); } idealPartitionState.put(partition, idealMapping); diff --git a/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java b/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java index 4a3d49b73a..834b846783 100644 --- a/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/HelixUtil.java @@ -399,7 +399,7 @@ public static Map> getIdealAssignmentForFullAuto( // Remove all disabled instances so that Helix will not consider them live. List disabledInstance = instanceConfigs.stream() - .filter(instanceConfig -> !InstanceValidationUtil.isInstanceEnabled(instanceConfig, clusterConfig)) + .filter(instanceConfig -> !instanceConfig.getInstanceEnabled()) .map(InstanceConfig::getInstanceName) .collect(Collectors.toList()); liveInstances.removeAll(disabledInstance); diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java new file mode 100644 index 0000000000..967d561e74 --- /dev/null +++ b/helix-core/src/main/java/org/apache/helix/util/InstanceUtil.java @@ -0,0 +1,198 @@ +package org.apache.helix.util; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */
+
+import java.util.List;
+import java.util.Map;
+import java.util.function.Function;
+import java.util.stream.Collectors;
+
+import com.google.common.collect.ImmutableMap;
+import org.apache.helix.AccessOption;
+import org.apache.helix.BaseDataAccessor;
+import org.apache.helix.ConfigAccessor;
+import org.apache.helix.HelixException;
+import org.apache.helix.PropertyPathBuilder;
+import org.apache.helix.constants.InstanceConstants;
+import org.apache.helix.model.ClusterTopologyConfig;
+import org.apache.helix.model.HelixConfigScope;
+import org.apache.helix.model.InstanceConfig;
+import org.apache.helix.model.builder.HelixConfigScopeBuilder;
+import org.apache.helix.zookeeper.datamodel.ZNRecord;
+import org.apache.helix.zookeeper.zkclient.DataUpdater;
+
+public class InstanceUtil {
+
+  // Private constructor to prevent instantiation
+  private InstanceUtil() {
+  }
+
+  // Validators for instance operation transitions
+  private static final Function<List<InstanceConfig>, Boolean> ALWAYS_ALLOWED =
+      (matchingInstances) -> true;
+  private static final Function<List<InstanceConfig>, Boolean> ALL_MATCHES_ARE_UNKNOWN =
+      (matchingInstances) -> matchingInstances.isEmpty() || matchingInstances.stream().allMatch(
+          instance -> instance.getInstanceOperation().getOperation()
+              .equals(InstanceConstants.InstanceOperation.UNKNOWN));
+  private static final Function<List<InstanceConfig>, Boolean> ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE =
+      (matchingInstances) -> matchingInstances.isEmpty() || matchingInstances.stream().allMatch(
+          instance -> instance.getInstanceOperation().getOperation()
+              .equals(InstanceConstants.InstanceOperation.UNKNOWN)
+              || instance.getInstanceOperation().getOperation()
+              .equals(InstanceConstants.InstanceOperation.EVACUATE));
+  private static final Function<List<InstanceConfig>, Boolean> ANY_MATCH_ENABLE_OR_DISABLE =
+      (matchingInstances) -> !matchingInstances.isEmpty() && matchingInstances.stream().anyMatch(
+          instance -> instance.getInstanceOperation().getOperation()
+              .equals(InstanceConstants.InstanceOperation.ENABLE) || instance.getInstanceOperation()
+              .getOperation().equals(InstanceConstants.InstanceOperation.DISABLE));
+
+  // Validator map for valid instance operation transitions <currentOperation>:<targetOperation>:<validator>
+  private static final ImmutableMap<InstanceConstants.InstanceOperation, ImmutableMap<InstanceConstants.InstanceOperation, Function<List<InstanceConfig>, Boolean>>>
+      validInstanceOperationTransitions =
+      ImmutableMap.of(InstanceConstants.InstanceOperation.ENABLE,
+          // ENABLE and DISABLE can be set to UNKNOWN when matching instance is in SWAP_IN and set to ENABLE in a transaction.
+          ImmutableMap.of(InstanceConstants.InstanceOperation.ENABLE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.DISABLE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED),
+          InstanceConstants.InstanceOperation.DISABLE,
+          ImmutableMap.of(InstanceConstants.InstanceOperation.DISABLE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.ENABLE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED),
+          InstanceConstants.InstanceOperation.SWAP_IN,
+          // SWAP_IN can be set to ENABLE when matching instance is in UNKNOWN state in a transaction.
+          ImmutableMap.of(InstanceConstants.InstanceOperation.SWAP_IN, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED),
+          InstanceConstants.InstanceOperation.EVACUATE,
+          ImmutableMap.of(InstanceConstants.InstanceOperation.EVACUATE, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.ENABLE, ALL_MATCHES_ARE_UNKNOWN,
+              InstanceConstants.InstanceOperation.DISABLE, ALL_MATCHES_ARE_UNKNOWN,
+              InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED),
+          InstanceConstants.InstanceOperation.UNKNOWN,
+          ImmutableMap.of(InstanceConstants.InstanceOperation.UNKNOWN, ALWAYS_ALLOWED,
+              InstanceConstants.InstanceOperation.ENABLE, ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE,
+              InstanceConstants.InstanceOperation.DISABLE, ALL_MATCHES_ARE_UNKNOWN_OR_EVACUATE,
+              InstanceConstants.InstanceOperation.SWAP_IN, ANY_MATCH_ENABLE_OR_DISABLE));
+
+  /**
+   * Validates if the transition from the current operation to the target operation is valid.
+   *
+   * @param configAccessor   The ConfigAccessor instance
+   * @param clusterName      The cluster name
+   * @param instanceConfig   The current instance configuration
+   * @param currentOperation The current operation
+   * @param targetOperation  The target operation
+   */
+  public static void validateInstanceOperationTransition(ConfigAccessor configAccessor,
+      String clusterName, InstanceConfig instanceConfig,
+      InstanceConstants.InstanceOperation currentOperation,
+      InstanceConstants.InstanceOperation targetOperation) {
+    // Check if the current operation and target operation are in the valid transitions map
+    if (!validInstanceOperationTransitions.containsKey(currentOperation)
+        || !validInstanceOperationTransitions.get(currentOperation).containsKey(targetOperation)) {
+      throw new HelixException(
+          "Invalid instance operation transition from " + currentOperation + " to "
+              + targetOperation);
+    }
+
+    // Throw exception if the validation fails
+    if (!validInstanceOperationTransitions.get(currentOperation).get(targetOperation)
+        .apply(findInstancesWithMatchingLogicalId(configAccessor, clusterName, instanceConfig))) {
+      throw new HelixException(
+          "Failed validation for instance operation transition from " + currentOperation + " to "
+              + targetOperation);
+    }
+  }
+
+  /**
+   * Finds the instances that have a matching logical ID with the given instance.
+   *
+   * @param configAccessor The ConfigAccessor instance
+   * @param clusterName    The cluster name
+   * @param instanceConfig The instance configuration to match
+   * @return A list of matching instances
+   */
+  public static List<InstanceConfig> findInstancesWithMatchingLogicalId(
+      ConfigAccessor configAccessor, String clusterName, InstanceConfig instanceConfig) {
+    String logicalIdKey =
+        ClusterTopologyConfig.createFromClusterConfig(configAccessor.getClusterConfig(clusterName))
+            .getEndNodeType();
+
+    // Retrieve and filter instances with matching logical ID
+    return configAccessor.getKeys(
+            new HelixConfigScopeBuilder(HelixConfigScope.ConfigScopeProperty.PARTICIPANT,
+                clusterName).build()).stream()
+        .map(instanceName -> configAccessor.getInstanceConfig(clusterName, instanceName)).filter(
+            potentialInstanceConfig ->
+                !potentialInstanceConfig.getInstanceName().equals(instanceConfig.getInstanceName())
+                    && potentialInstanceConfig.getLogicalId(logicalIdKey)
+                    .equals(instanceConfig.getLogicalId(logicalIdKey)))
+        .collect(Collectors.toList());
+  }
+
+  /**
+   * Sets the instance operation for the given instance.
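+   * The requested operation is first validated against the transition table above, using the
+   * other instances that share this instance's logicalId, and is then written to the
+   * participant's InstanceConfig via a read-modify-write on its config znode.
+   * <p>Illustrative call (a sketch; the accessor objects and the prebuilt {@code evacuateOp}
+   * value are assumed to exist already and are not part of this patch):
+   * <pre>{@code
+   * // evacuateOp wraps InstanceConstants.InstanceOperation.EVACUATE in an
+   * // InstanceConfig.InstanceOperation
+   * InstanceUtil.setInstanceOperation(configAccessor, baseAccessor, "TestCluster",
+   *     "localhost_12918", evacuateOp);
+   * }</pre>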
+   *
+   * @param configAccessor    The ConfigAccessor instance
+   * @param baseAccessor      The BaseDataAccessor instance
+   * @param clusterName       The cluster name
+   * @param instanceName      The instance name
+   * @param instanceOperation The instance operation to set
+   */
+  public static void setInstanceOperation(ConfigAccessor configAccessor,
+      BaseDataAccessor<ZNRecord> baseAccessor, String clusterName, String instanceName,
+      InstanceConfig.InstanceOperation instanceOperation) {
+    String path = PropertyPathBuilder.instanceConfig(clusterName, instanceName);
+
+    // Retrieve the current instance configuration
+    InstanceConfig instanceConfig = configAccessor.getInstanceConfig(clusterName, instanceName);
+    if (instanceConfig == null) {
+      throw new HelixException("Cluster " + clusterName + ", instance: " + instanceName
+          + ", instance config does not exist");
+    }
+
+    // Validate the instance operation transition
+    validateInstanceOperationTransition(configAccessor, clusterName, instanceConfig,
+        instanceConfig.getInstanceOperation().getOperation(),
+        instanceOperation == null ? InstanceConstants.InstanceOperation.ENABLE
+            : instanceOperation.getOperation());
+
+    // Update the instance operation
+    boolean succeeded = baseAccessor.update(path, new DataUpdater<ZNRecord>() {
+      @Override
+      public ZNRecord update(ZNRecord currentData) {
+        if (currentData == null) {
+          throw new HelixException("Cluster: " + clusterName + ", instance: " + instanceName
+              + ", participant config is null");
+        }
+
+        InstanceConfig config = new InstanceConfig(currentData);
+        config.setInstanceOperation(instanceOperation);
+        return config.getRecord();
+      }
+    }, AccessOption.PERSISTENT);
+
+    if (!succeeded) {
+      throw new HelixException(
+          "Failed to update instance operation. Please check if instance is disabled.");
+    }
+  }
+}
diff --git a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
index 2542ecf7fb..5dea683346 100644
--- a/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
+++ b/helix-core/src/main/java/org/apache/helix/util/InstanceValidationUtil.java
@@ -73,22 +73,22 @@ private InstanceValidationUtil() {
   public static boolean isEnabled(HelixDataAccessor dataAccessor, String instanceName) {
     PropertyKey.Builder propertyKeyBuilder = dataAccessor.keyBuilder();
     InstanceConfig instanceConfig = dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instanceName));
-    ClusterConfig clusterConfig = dataAccessor.getProperty(propertyKeyBuilder.clusterConfig());
-    // TODO deprecate instance level config checks once migrated the enable status to cluster config only
-    if (instanceConfig == null || clusterConfig == null) {
-      throw new HelixException("InstanceConfig or ClusterConfig is NULL");
+    if (instanceConfig == null) {
+      throw new HelixException("InstanceConfig is NULL");
     }
-    return isInstanceEnabled(instanceConfig, clusterConfig);
-
+    return instanceConfig.getInstanceEnabled();
   }

   /**
    * Check if the instance is enabled by configuration
+   * @deprecated Use {@link InstanceConfig#getInstanceEnabled()} instead. We will no longer
+   * be using cluster config to enable/disable instances.
* @param instanceConfig * @param clusterConfig * @return */ + @Deprecated public static boolean isInstanceEnabled(InstanceConfig instanceConfig, ClusterConfig clusterConfig) { if (instanceConfig == null) { throw new HelixException("InstanceConfig is NULL"); diff --git a/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java b/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java index bc439549fe..18ddc0283d 100644 --- a/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java +++ b/helix-core/src/main/java/org/apache/helix/util/WeightAwareRebalanceUtil.java @@ -29,6 +29,7 @@ import org.apache.helix.api.config.RebalanceConfig; import org.apache.helix.api.rebalancer.constraint.AbstractRebalanceHardConstraint; import org.apache.helix.api.rebalancer.constraint.AbstractRebalanceSoftConstraint; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.common.PartitionStateMap; import org.apache.helix.controller.common.ResourcesStateMap; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; @@ -77,7 +78,7 @@ public WeightAwareRebalanceUtil(ClusterConfig clusterConfig, List instanceConfigs) { for (InstanceConfig instanceConfig : instanceConfigs) { // ensure the instance is enabled - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _instanceConfigMap.put(instanceConfig.getInstanceName(), instanceConfig); } // ensure no instance is disabled diff --git a/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java b/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java index bf9b59dc2d..5ea42dc3e7 100644 --- a/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java +++ b/helix-core/src/test/java/org/apache/helix/cloud/event/TestDefaultCloudEventCallbackImpl.java @@ -52,20 +52,18 @@ public void testDisableInstance() { Assert.assertFalse(InstanceValidationUtil .isEnabled(_manager.getHelixDataAccessor(), _instanceManager.getInstanceName())); Assert.assertEquals(_manager.getConfigAccessor() - .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()) - .getInstanceDisabledType(), InstanceConstants.InstanceDisabledType.CLOUD_EVENT.name()); + .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()).getInstanceOperation() + .getSource(), InstanceConstants.InstanceOperationSource.AUTOMATION); - // Should not disable instance if it is already disabled due to other reasons - // And disabled type should remain unchanged _admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), false); _impl.disableInstance(_instanceManager, null); Assert.assertFalse(InstanceValidationUtil .isEnabled(_manager.getHelixDataAccessor(), _instanceManager.getInstanceName())); Assert.assertEquals(_manager.getConfigAccessor() .getInstanceConfig(CLUSTER_NAME, _instanceManager.getInstanceName()) - .getInstanceDisabledType(), - InstanceConstants.InstanceDisabledType.DEFAULT_INSTANCE_DISABLE_TYPE.name()); + .getInstanceOperation().getSource(), InstanceConstants.InstanceOperationSource.USER); + _admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), true); _admin.enableInstance(CLUSTER_NAME, _instanceManager.getInstanceName(), false, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, null); } diff --git a/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java 
b/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java index 0218c3ffcb..a265605185 100644 --- a/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java +++ b/helix-core/src/test/java/org/apache/helix/common/ZkTestBase.java @@ -45,6 +45,7 @@ import org.apache.helix.SystemPropertyKeys; import org.apache.helix.TestHelper; import org.apache.helix.api.config.HelixConfigProperty; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.pipeline.AbstractAsyncBaseStage; import org.apache.helix.controller.pipeline.Pipeline; import org.apache.helix.controller.pipeline.Stage; @@ -633,7 +634,6 @@ protected List setupLiveInstances(String clusterName, int[] liveIn for (int i = 0; i < liveInstances.length; i++) { String instance = "localhost_" + liveInstances[i]; - _liveInstanceOwners.putIfAbsent(clusterName, new HashMap<>()); Map clientMap = _liveInstanceOwners.get(clusterName); clientMap.putIfAbsent(instance, DedicatedZkClientFactory.getInstance() @@ -687,7 +687,7 @@ protected void setupInstances(String clusterName, int[] instances) { InstanceConfig instanceConfig = new InstanceConfig(instance); instanceConfig.setHostName("localhost"); instanceConfig.setPort("" + instances[i]); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, instanceConfig); } } diff --git a/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java b/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java index b4f405c745..bdfa2784ba 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java +++ b/helix-core/src/test/java/org/apache/helix/controller/changedetector/TestResourceChangeDetector.java @@ -31,6 +31,7 @@ import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; @@ -399,7 +400,7 @@ public void testIgnoreNonTopologyChanges() { _dataAccessor.getProperty(_keyBuilder.instanceConfig(instanceName)); Assert.assertTrue(instanceConfig.getInstanceEnabled()); try { - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _dataAccessor.updateProperty(_keyBuilder.instanceConfig(instanceName), instanceConfig); _dataProvider.notifyDataChange(ChangeType.INSTANCE_CONFIG); _dataProvider.refresh(_dataAccessor); @@ -410,7 +411,7 @@ public void testIgnoreNonTopologyChanges() { } finally { // remove newly added resource/ideastate _gSetupTool.getClusterManagementTool().dropResource(CLUSTER_NAME, resourceName); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _dataAccessor.updateProperty(_keyBuilder.instanceConfig(instanceName), instanceConfig); } } diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java index 45c35f3605..f8af3ee419 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java +++ 
b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/TestAbstractRebalancer.java @@ -56,6 +56,8 @@ public void testComputeBestPossibleState(String comment, String stateModelName, new IdealState("test"), new ClusterConfig("TestCluster"), partition, MonitoredAbnormalResolver.DUMMY_STATE_RESOLVER); + System.out.println("Expected best possible state map: " + expectedBestPossibleMap); + System.out.println("Actual best possible state map: " + bestPossibleMap); Assert.assertTrue(bestPossibleMap.equals(expectedBestPossibleMap)); } diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java index 951e0e3c52..a554283311 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancer.java @@ -118,9 +118,9 @@ protected ResourceControllerDataProvider setupClusterDataCache() throws IOExcept liveInstanceMap.put(instanceName, testLiveInstance); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); when(testCache.getLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getAssignableInstances()).thenReturn(_instances); when(testCache.getAllInstances()).thenReturn(_instances); @@ -375,7 +375,7 @@ public void testInvalidClusterStatus() throws IOException, HelixRebalanceExcepti Collectors.toMap(resourceName -> resourceName, Resource::new)); try { rebalancer.computeBestPossibleAssignment(clusterData, resourceMap, - clusterData.getAssignableEnabledLiveInstances(), new CurrentStateOutput(), _algorithm); + clusterData.getEnabledLiveInstances(), new CurrentStateOutput(), _algorithm); Assert.fail("Rebalance shall fail."); } catch (HelixRebalanceException ex) { Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); @@ -439,7 +439,7 @@ public void testAlgorithmException() // Calculation will fail try { rebalancer.computeBestPossibleAssignment(clusterData, resourceMap, - clusterData.getAssignableEnabledLiveInstances(), new CurrentStateOutput(), badAlgorithm); + clusterData.getEnabledLiveInstances(), new CurrentStateOutput(), badAlgorithm); Assert.fail("Rebalance shall fail."); } catch (HelixRebalanceException ex) { Assert.assertEquals(ex.getFailureType(), HelixRebalanceException.Type.FAILED_TO_CALCULATE); @@ -749,8 +749,8 @@ public void testRebalanceOverwrite() throws HelixRebalanceException, IOException Set instances = new HashSet<>(_instances); instances.add(offlineInstance); when(clusterData.getAssignableInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledLiveInstances()).thenReturn( + when(clusterData.getEnabledInstances()).thenReturn(instances); + when(clusterData.getEnabledLiveInstances()).thenReturn( new HashSet<>(Arrays.asList(instance0, instance1, instance2))); Map 
instanceOfflineTimeMap = new HashMap<>(); instanceOfflineTimeMap.put(offlineInstance, System.currentTimeMillis() + Integer.MAX_VALUE); @@ -894,8 +894,8 @@ public void testInstanceCapacityProvider() throws IOException, HelixRebalanceExc // force create a fake offlineInstance that's in delay window Set instances = new HashSet<>(_instances); when(clusterData.getAssignableInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledInstances()).thenReturn(instances); - when(clusterData.getAssignableEnabledLiveInstances()).thenReturn(instances); + when(clusterData.getEnabledInstances()).thenReturn(instances); + when(clusterData.getEnabledLiveInstances()).thenReturn(instances); Map instanceConfigMap = clusterData.getAssignableInstanceConfigMap(); when(clusterData.getAssignableInstanceConfigMap()).thenReturn(instanceConfigMap); when(clusterData.getInstanceConfigMap()).thenReturn(instanceConfigMap); diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java index c5c7b560c6..3fb05e5f8b 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/TestWagedRebalancerMetrics.java @@ -266,9 +266,9 @@ protected ResourceControllerDataProvider setupClusterDataCache() throws IOExcept liveInstanceMap.put(instanceName, testLiveInstance); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); when(testCache.getLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledInstances()).thenReturn(liveInstanceMap.keySet()); + when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getEnabledLiveInstances()).thenReturn(liveInstanceMap.keySet()); when(testCache.getAssignableInstances()).thenReturn(_instances); when(testCache.getAllInstances()).thenReturn(_instances); diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java index c9deb792de..b75a340933 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/AbstractTestClusterModel.java @@ -29,6 +29,7 @@ import java.util.Set; import java.util.stream.Collectors; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.dataproviders.ResourceControllerDataProvider; import org.apache.helix.controller.rebalancer.constraint.MonitoredAbnormalResolver; import org.apache.helix.model.BuiltInStateModelDefinitions; @@ -84,7 +85,7 @@ protected InstanceConfig createMockInstanceConfig(String instanceId) { InstanceConfig testInstanceConfig = new InstanceConfig(instanceId); testInstanceConfig.setInstanceCapacityMap(_capacityDataMap); testInstanceConfig.addTag(_testInstanceTags.get(0)); - testInstanceConfig.setInstanceEnabled(true); + testInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); 
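+    // ENABLE via InstanceOperation replaces the deprecated setInstanceEnabled(true) default used here before.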
testInstanceConfig.setZoneId(_testFaultZoneId); return testInstanceConfig; } diff --git a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java index 34582d600d..2e41b6dbea 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java +++ b/helix-core/src/test/java/org/apache/helix/controller/rebalancer/waged/model/TestClusterModelProvider.java @@ -106,7 +106,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException activeInstances.add(instance1); activeInstances.add(instance2); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); // test 0, empty input Assert.assertEquals( @@ -144,7 +144,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException // test 2, no additional replica to be assigned testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( @@ -169,7 +169,7 @@ public void testFindToBeAssignedReplicasForMinActiveReplica() throws IOException // test 3, minActiveReplica==2, two partitions falling short testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( @@ -207,7 +207,7 @@ public void testClusterModelForDelayedRebalanceOverwrite() throws IOException { activeInstances.add(instance1); activeInstances.add(instance2); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); // test 1, one partition under minActiveReplica Map>> input = ImmutableMap.of( @@ -247,7 +247,7 @@ public void testClusterModelForDelayedRebalanceOverwrite() throws IOException { // test 2, minActiveReplica==2, three partitions falling short testCache = setupClusterDataCache(); when(testCache.getAssignableLiveInstances()).thenReturn(liveInstanceMap); - when(testCache.getAssignableEnabledLiveInstances()).thenReturn(activeInstances); + when(testCache.getEnabledLiveInstances()).thenReturn(activeInstances); input = ImmutableMap.of( _resourceNames.get(0), ImmutableMap.of( diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java index 0027f8e4ef..2a548ce457 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleCalcStageCompatibility.java @@ -58,6 +58,7 @@ public void testSemiAutoModeCompatibility() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); 
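+      // The pipeline now also reads AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN, so these stage
+      // tests seed it with the same CurrentStateOutput that backs CURRENT_STATE.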
event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -94,6 +95,7 @@ public void testCustomModeCompatibility() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java index e33dc9f5da..518c610be0 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestBestPossibleStateCalcStage.java @@ -58,6 +58,7 @@ public void testSimple() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ReadClusterDataStage stage1 = new ReadClusterDataStage(); @@ -117,6 +118,7 @@ public void testAutoEnterMaintenanceWhenExceedingOfflineNodes() { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java index 7b891522cf..7d815c170e 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestCancellationMessageGeneration.java @@ -68,6 +68,7 @@ public void TestOFFLINEToDROPPED() throws Exception { when(message.getToState()).thenReturn("SLAVE"); when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)).thenReturn(message); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); @@ -157,6 +158,7 @@ private List generateMessages(String currentState, String fromState, St when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)) .thenReturn(pendingMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + 
event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java index 7da3d64c25..f15e6b87dd 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestIntermediateStateCalcStage.java @@ -177,6 +177,7 @@ public void testNoStateMissing() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); runStage(event, new IntermediateStateCalcStage()); @@ -261,6 +262,7 @@ public void testWithClusterConfigChange() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -379,6 +381,7 @@ public void testThrottleByErrorPartition() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -553,6 +556,7 @@ public void testPartitionMissing() { event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); runStage(event, new IntermediateStateCalcStage()); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java index 7c20b0279f..f1a3da4415 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementMessageGeneration.java @@ -84,6 +84,7 @@ private List generateMessages(String currentState, String fromState, St when(currentStateOutput.getPendingMessage(TEST_RESOURCE, partition, TEST_INSTANCE)) 
.thenReturn(pendingMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // Set helix manager to event event.addAttribute(AttributeName.helixmanager.name(), mock(HelixManager.class)); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java index 1c6162428b..0c144d8eb3 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java @@ -51,6 +51,7 @@ public class TestManagementModeStage extends ZkTestBase { public void beforeClass() { _clusterName = "CLUSTER_" + TestHelper.getTestClassName(); _accessor = new ZKHelixDataAccessor(_clusterName, new ZkBaseDataAccessor<>(_gZkClient)); + _gSetupTool.setupTestCluster(_clusterName); _manager = new DummyClusterManager(_clusterName, _accessor); } @@ -65,6 +66,8 @@ public void testClusterFreezeStatus() throws Exception { // ideal state: node0 is MASTER, node1 is SLAVE // replica=2 means 1 master and 1 slave setupIdealState(_clusterName, new int[]{0, 1}, new String[]{"TestDB"}, 1, 2); + _gSetupTool.addInstanceToCluster(_clusterName, "localhost_0"); + _gSetupTool.addInstanceToCluster(_clusterName, "localhost_1"); List liveInstances = setupLiveInstances(_clusterName, new int[]{0, 1}); setupStateModel(_clusterName); @@ -96,7 +99,7 @@ public void testClusterFreezeStatus() throws Exception { ControllerHistory history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory()); - Assert.assertNull(history); + Assert.assertTrue(history.getMaintenanceHistoryList().isEmpty()); // Mark both live instances to be frozen, then entering freeze mode is complete for (int i = 0; i < 2; i++) { diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java index e4aeed04f8..515340d7cd 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestReplicaLevelThrottling.java @@ -71,7 +71,7 @@ private void prepareCache(Map cacheMap, Mock mock) { when(mock.cache.getClusterConfig()).thenReturn((ClusterConfig) cacheMap.get(CacheKeys.clusterConfig.name())); when(mock.cache.getStateModelDef((String) cacheMap.get(CacheKeys.stateModelName.name()))).thenReturn( (StateModelDefinition) cacheMap.get(CacheKeys.stateModelDef.name())); - when(mock.cache.getAssignableEnabledLiveInstances()).thenReturn(new HashSet<>( + when(mock.cache.getEnabledLiveInstances()).thenReturn(new HashSet<>( ((Map>) cacheMap.get(CacheKeys.preferenceList.name())).values().iterator().next())); when(mock.cache.getLiveInstances()).thenReturn(new HashSet<>( ((Map>) cacheMap.get(CacheKeys.preferenceList.name())).values().iterator().next()).stream() @@ -189,6 +189,7 @@ public List loadTestInputs(String fileName) { } ClusterEvent event = new ClusterEvent(CLUSTER_NAME, ClusterEventType.Unknown); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); // add current states + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); // add current states event.addAttribute(AttributeName.ControllerDataProvider.name(), 
buildCache(mock, numReplica, minActiveReplica, stateModelDef, stateModelName, preferenceLists)); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageOutput); diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java index 22c20f7dd2..e457e31cab 100644 --- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java +++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestStateTransitionPriority.java @@ -85,6 +85,7 @@ public void testResourceLevelPriorityForRecoveryBalance( event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); runStage(event, new ReadClusterDataStage()); // Keep update the current state. @@ -133,6 +134,7 @@ public void testResourceLevelPriorityForLoadBalance( event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), messageSelectOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -194,6 +196,7 @@ public void testPartitionLevelPriority(String resourceName, event.addAttribute(AttributeName.MESSAGES_SELECTED.name(), generateMessageMapForPartition(bestPossibleMap, currentStateMap, Collections.emptyList(), resourceName)); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); runStage(event, new ReadClusterDataStage()); @@ -350,6 +353,7 @@ private void updateCurrentOutput(List resourcePriority, resourcePriority.add(resourceName); currentStateOutput.setCurrentState(resourceName, partition, instanceName, state); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); } private void updateCurrentStateForPartitionLevelPriority(List partitionPriority, diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java b/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java index 2f1dee269f..1b2e54b92c 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestAlertingRebalancerFailure.java @@ -33,6 +33,7 @@ import org.apache.helix.HelixDataAccessor; import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import org.apache.helix.integration.common.ZkStandAloneCMTestBase; @@ -297,7 +298,8 @@ private void 
setDomainId(String instanceName, ConfigAccessor configAccessor) { private void setInstanceEnable(String instanceName, boolean enabled, ConfigAccessor configAccessor) { InstanceConfig instanceConfig = configAccessor.getInstanceConfig(CLUSTER_NAME, instanceName); - instanceConfig.setInstanceEnabled(enabled); + instanceConfig.setInstanceOperation(enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); configAccessor.setInstanceConfig(CLUSTER_NAME, instanceName, instanceConfig); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java b/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java index a5416d4d13..ce3077dd9f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestDisableCustomCodeRunner.java @@ -32,6 +32,7 @@ import org.apache.helix.PropertyKey; import org.apache.helix.TestHelper; import org.apache.helix.ZkUnitTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.manager.zk.ZKHelixAdmin; @@ -170,7 +171,7 @@ public void test() throws Exception { InstanceConfig instanceConfig = new InstanceConfig(fakeInstanceName); instanceConfig.setHostName("localhost"); instanceConfig.setPort("10000"); - instanceConfig.setInstanceEnabled(true); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); admin.addInstance(clusterName, instanceConfig); LiveInstance fakeInstance = new LiveInstance(fakeInstanceName); diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java b/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java index 98dd281b5e..9cb248f96f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java +++ b/helix-core/src/test/java/org/apache/helix/integration/TestNoThrottleDisabledPartitions.java @@ -28,6 +28,7 @@ import org.apache.helix.TestHelper; import org.apache.helix.api.config.StateTransitionThrottleConfig; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.manager.zk.ZKHelixDataAccessor; @@ -74,7 +75,7 @@ public void testDisablingTopStateReplicaByDisablingInstance() throws Exception { // Disable instance 0 so that it will cause a partition to do a load balance PropertyKey key = _accessor.keyBuilder().instanceConfig(_participants[0].getInstanceName()); InstanceConfig instanceConfig = _accessor.getProperty(key); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _accessor.setProperty(key, instanceConfig); // Resume the controller @@ -234,7 +235,7 @@ public void testNoThrottleOnDisabledInstance() throws Exception { // Disable an instance so that it will not be subject to throttling PropertyKey key = _accessor.keyBuilder().instanceConfig(_participants[0].getInstanceName()); InstanceConfig instanceConfig = _accessor.getProperty(key); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); 
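+    // DISABLE through InstanceOperation is the replacement for the deprecated setInstanceEnabled(false) call.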
_accessor.setProperty(key, instanceConfig); // Set the state transition delay so that transitions would be processed slowly diff --git a/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java b/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java new file mode 100644 index 0000000000..5b13703b6f --- /dev/null +++ b/helix-core/src/test/java/org/apache/helix/integration/TestSetPartitionsToErrorState.java @@ -0,0 +1,99 @@ +package org.apache.helix.integration; + +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import org.apache.helix.TestHelper; +import org.apache.helix.common.ZkTestBase; +import org.apache.helix.integration.manager.ClusterControllerManager; +import org.apache.helix.integration.manager.MockParticipantManager; +import org.apache.helix.tools.ClusterSetup; +import org.apache.helix.tools.ClusterStateVerifier; +import org.testng.Assert; +import org.testng.annotations.Test; + +public class TestSetPartitionsToErrorState extends ZkTestBase { + + @Test() + public void testSetPartitionsToErrorState() throws Exception { + String className = TestHelper.getTestClassName(); + String methodName = TestHelper.getTestMethodName(); + String clusterName = className + "_" + methodName; + final int n = 5; + + System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis())); + + TestHelper.setupCluster(clusterName, ZK_ADDR, 12918, // participant port + "localhost", // participant name prefix + "TestDB", // resource name prefix + 1, // resources + 10, // partitions per resource + n, // number of nodes + 3, // replicas + "MasterSlave", true); // do rebalance + + ClusterControllerManager controller = + new ClusterControllerManager(ZK_ADDR, clusterName, "controller_0"); + controller.syncStart(); + + // start mock participants + MockParticipantManager[] participants = new MockParticipantManager[n]; + for (int i = 0; i < n; i++) { + String instanceName = "localhost_" + (12918 + i); + participants[i] = new MockParticipantManager(ZK_ADDR, clusterName, instanceName); + participants[i].syncStart(); + } + + // verify cluster + HashMap> errStateMap = new HashMap<>(); + errStateMap.put("TestDB0", new HashMap<>()); + boolean result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // set a non exist partition to ERROR, should throw exception + try { + String command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_nonExist"; + ClusterSetup.processCommandLineArgs(command.split("\\s+")); + Assert.fail("Should throw exception on setting a non-exist partition to error"); + } catch (Exception e) { + // OK + } + + // set one partition not in ERROR state to ERROR + String command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_4"; + ClusterSetup.processCommandLineArgs(command.split("\\s+")); + errStateMap.get("TestDB0").put("TestDB0_4", "localhost_12918"); + result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // set another partition not in ERROR state to ERROR + command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_7"; + 
ClusterSetup.processCommandLineArgs(command.split("\\s+")); + errStateMap.get("TestDB0").put("TestDB0_7", "localhost_12918"); + result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // setting a partition already in ERROR state to ERROR - message does not get processed + command = "--zkSvr " + ZK_ADDR + " --setPartitionsToError " + clusterName + + " localhost_12918 TestDB0 TestDB0_7"; + ClusterSetup.processCommandLineArgs(command.split("\\s+")); + result = ClusterStateVerifier.verifyByZkCallback( + (new ClusterStateVerifier.BestPossAndExtViewZkVerifier(ZK_ADDR, clusterName, errStateMap))); + Assert.assertTrue(result, "Cluster verification fails"); + + // clean up + controller.syncStop(); + for (int i = 0; i < 5; i++) { + participants[i].syncStop(); + } + deleteCluster(clusterName); + + System.out.println("END " + clusterName + " at " + new Date(System.currentTimeMillis())); + } +} diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java index ebfb03e8a0..6654098f8b 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java +++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterMaintenanceMode.java @@ -280,7 +280,7 @@ public void testMaxPartitionLimit() throws Exception { Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(maintenanceSignal.getAutoTriggerReason(), - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); // Bring up all instances for (int i = 0; i < 3; i++) { @@ -306,7 +306,7 @@ public void testMaxPartitionLimit() throws Exception { Assert.assertEquals(maintenanceSignal.getTriggeringEntity(), MaintenanceSignal.TriggeringEntity.CONTROLLER); Assert.assertEquals(maintenanceSignal.getAutoTriggerReason(), - MaintenanceSignal.AutoTriggerReason.MAX_OFFLINE_INSTANCES_EXCEEDED); + MaintenanceSignal.AutoTriggerReason.MAX_INSTANCES_UNABLE_TO_ACCEPT_ONLINE_REPLICAS); // Set the cluster config for auto-exiting maintenance mode ClusterConfig clusterConfig = _manager.getConfigAccessor().getClusterConfig(CLUSTER_NAME); diff --git a/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java b/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java index b11e6350e5..a61ffea9ac 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java +++ b/helix-core/src/test/java/org/apache/helix/integration/messaging/TestMessageThrottle2.java @@ -41,6 +41,7 @@ import org.apache.helix.NotificationContext; import org.apache.helix.PropertyKey.Builder; import org.apache.helix.TestHelper; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.common.ZkTestBase; import org.apache.helix.controller.HelixControllerMain; @@ -379,7 +380,7 @@ private static void addInstanceConfig(String instanceName) { if (instanceConfig == null) { InstanceConfig config = new InstanceConfig(instanceName); config.setHostName("localhost"); - config.setInstanceEnabled(true); + 
config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); echo("Adding InstanceConfig:" + config); admin.addInstance(_clusterName, config); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java index 3b13868507..fef7ea0b96 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/CrushRebalancers/TestNodeSwap.java @@ -30,6 +30,7 @@ import org.apache.helix.ConfigAccessor; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.MultiRoundCrushRebalanceStrategy; @@ -170,7 +171,7 @@ public void testNodeSwap(String rebalanceStrategyName, String rebalanceStrategyC final InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, oldParticipantName); // disable the node first - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool().setInstanceConfig(CLUSTER_NAME, oldParticipantName, instanceConfig); Assert.assertTrue(_clusterVerifier.verify(10000)); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java index 15b66af62d..cd5338ef44 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/PartitionMigration/TestExpandCluster.java @@ -21,6 +21,7 @@ import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.IdealState; diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java index 1fc3a3e203..67b575f0c0 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java @@ -99,7 +99,7 @@ public class TestInstanceOperation extends ZkTestBase { List _participantNames = new ArrayList<>(); private Set _allDBs = new HashSet<>(); private ZkHelixClusterVerifier _clusterVerifier; - private ZkHelixClusterVerifier _bestPossibleClusterVerifier; + private BestPossibleExternalViewVerifier _bestPossibleClusterVerifier; private ConfigAccessor _configAccessor; private long _stateModelDelay = 3L; @@ -204,15 +204,15 @@ private void disableTopologyAwareRebalance() { Assert.assertTrue(_clusterVerifier.verifyByPolling()); } - private void removeOfflineOrDisabledOrSwapInInstances() { + private void removeOfflineOrInactiveInstances() { // Remove all instances that are not live, disabled, or in SWAP_IN state. 
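    // (i.e. drop a participant when it is not connected, when its config is disabled, or when its
    // InstanceOperation is SWAP_IN.)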
for (int i = 0; i < _participants.size(); i++) { String participantName = _participantNames.get(i); InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, participantName); if (!_participants.get(i).isConnected() || !instanceConfig.getInstanceEnabled() - || instanceConfig.getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { + || instanceConfig.getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { if (_participants.get(i).isConnected()) { _participants.get(i).syncStop(); } @@ -268,11 +268,6 @@ public void testEvacuate() throws Exception { _gSetupTool.dropResourceFromCluster(CLUSTER_NAME, semiAutoDB); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Disable, stop, and drop the instance from the cluster. - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToEvacuate, false); - _participants.get(0).syncStop(); - removeOfflineOrDisabledOrSwapInInstances(); - // Compare the current ev with the previous one, it should be exactly the same since the baseline should not change // after the instance is dropped. Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -284,9 +279,12 @@ public void testRevertEvacuation() throws Exception { System.out.println("START TestInstanceOperation.testRevertEvacuation() at " + new Date(System.currentTimeMillis())); // revert an evacuate instance String instanceToEvacuate = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertTrue( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToEvacuate) + .getInstanceEnabled()); Assert.assertTrue(_clusterVerifier.verifyByPolling()); // EV should contain all participants, check resources one by one @@ -302,10 +300,12 @@ public void testAddingNodeWithEvacuationTag() throws Exception { System.out.println("START TestInstanceOperation.testAddingNodeWithEvacuationTag() at " + new Date(System.currentTimeMillis())); // first disable and instance, and wait for all replicas to be moved out String mockNewInstance = _participants.get(0).getInstanceName(); + // This is using a deprecated method to ensure that the disabling still takes precedence over the InstanceOperation when being set + // to false. 
_gSetupTool.getClusterManagementTool() .enableInstance(CLUSTER_NAME, mockNewInstance, false); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - //ev should contain all instances but the disabled one + // ev should contain all instances but the disabled one Map assignment = getEVs(); List currentActiveInstances = _participantNames.stream().filter(n -> !n.equals(mockNewInstance)).collect(Collectors.toList()); @@ -317,10 +317,13 @@ public void testAddingNodeWithEvacuationTag() throws Exception { } // add evacuate tag and enable instance + // Because HELIX_ENABLED is set to false, getInstanceOperation still returns DISABLE _gSetupTool.getClusterManagementTool() .setInstanceOperation(CLUSTER_NAME, mockNewInstance, InstanceConstants.InstanceOperation.EVACUATE); - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, mockNewInstance, true); + + // enable instance so InstanceOperation is no longer overriden with DISABLE + _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, mockNewInstance, true); + //ev should be the same assignment = getEVs(); currentActiveInstances = @@ -335,7 +338,7 @@ public void testAddingNodeWithEvacuationTag() throws Exception { // now remove operation tag String instanceToEvacuate = _participants.get(0).getInstanceName(); _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -347,84 +350,76 @@ public void testAddingNodeWithEvacuationTag() throws Exception { } } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithEvacuationTag") + @Test(dependsOnMethods = "testAddingNodeWithEvacuationTag") public void testNodeSwapNoTopologySetup() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapNoTopologySetup() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error that the logicalId does not have SWAP_OUT instance because, - // helix can't determine what topology key to use to get the logicalId if TOPOLOGY is not set. + // Add instance with InstanceOperation set to SWAP_IN as default + // The instance will be added with UNKNOWN because the logicalId will not match the + // swap out instance since the topology configs are not set. 
String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapNoTopologySetup") - public void testAddingNodeWithSwapOutInstanceOperation() throws Exception { + @Test(dependsOnMethods = "testNodeSwapNoTopologySetup") + public void testAddingNodeWithEnableInstanceOperation() throws Exception { System.out.println( - "START TestInstanceOperation.testAddingNodeWithSwapOutInstanceOperation() at " + new Date( + "START TestInstanceOperation.testAddingNodeWithEnableInstanceOperation() at " + new Date( System.currentTimeMillis())); enabledTopologyAwareRebalance(); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with InstanceOperation set to SWAP_IN + // Add instance with InstanceOperation set to ENABLE + // The instance should be added with UNKNOWN since there is already an instance with + // the same logicalId in the cluster and this instance is not being set to SWAP_IN when + // added. 
String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_OUT, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithSwapOutInstanceOperation") - public void testAddingNodeWithSwapOutNodeInstanceOperationUnset() throws Exception { - System.out.println( - "START TestInstanceOperation.testAddingNodeWithSwapOutNodeInstanceOperationUnset() at " - + new Date(System.currentTimeMillis())); - - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to null - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + InstanceConstants.InstanceOperation.ENABLE, -1); - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testAddingNodeWithSwapOutNodeInstanceOperationUnset") + @Test(dependsOnMethods = "testAddingNodeWithEnableInstanceOperation") public void testNodeSwapWithNoSwapOutNode() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapWithNoSwapOutNode() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Add new instance with InstanceOperation set to SWAP_IN + // The instance should be added with UNKNOWN since there is not an instance with a matching + // logicalId in the cluster to swap with. 
String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, "1000", "zone_1000", - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); } @Test(dependsOnMethods = "testNodeSwapWithNoSwapOutNode") @@ -433,21 +428,28 @@ public void testNodeSwapSwapInNodeNoInstanceOperationEnabled() throws Exception "START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperationEnabled() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Add instance with same logicalId with InstanceOperation unset - // This should work because adding instance with InstanceOperation unset will automatically - // set the InstanceOperation to SWAP_IN. + // Add instance with same logicalId with InstanceOperation unset, this is the same as default + // which is ENABLE. + // The instance should be set to UNKNOWN since there is already a matching logicalId in the cluster. String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, -1); + + Assert.assertEquals( + _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceToSwapInName) + .getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); + + // Setting the InstanceOperation to SWAP_IN should work because there is a matching logicalId in + // the cluster and the InstanceCapacityWeights and FaultZone match. 
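Taken together, the last few tests pin down the admission rule for newly added participants, and the next test covers the already-swapping case. A consolidated, hypothetical sketch of that rule (the helper and its boolean parameters are invented to summarize the observed behavior; they are not Helix API):

import org.apache.helix.constants.InstanceConstants.InstanceOperation;

public class AddInstanceOperationSketch {
  // Illustrative summary of the operation an added participant ends up with.
  static InstanceOperation resolveOnAdd(InstanceOperation requested, boolean topologyConfigured,
      boolean logicalIdAlreadyAssignable, boolean logicalIdAlreadySwapping) {
    if (requested == InstanceOperation.SWAP_IN) {
      // SWAP_IN only sticks when the logicalId can be resolved (topology configured),
      // matches an existing assignable instance, and that instance is not already in a swap.
      return topologyConfigured && logicalIdAlreadyAssignable && !logicalIdAlreadySwapping
          ? InstanceOperation.SWAP_IN : InstanceOperation.UNKNOWN;
    }
    // ENABLE (also the default when no operation is passed) is demoted to UNKNOWN when the
    // logicalId is already taken, so one logicalId never has two assignable instances.
    return logicalIdAlreadyAssignable ? InstanceOperation.UNKNOWN : requested;
  }
}

Once the node is in the cluster with UNKNOWN, SWAP_IN can still be applied explicitly, which is what the next statement does.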
+ _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); Assert.assertTrue(_gSetupTool.getClusterManagementTool() @@ -461,20 +463,17 @@ public void testNodeSwapSwapInNodeWithAlreadySwappingPair() throws Exception { "START TestInstanceOperation.testNodeSwapSwapInNodeWithAlreadySwappingPair() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Add another instance with InstanceOperation set to SWAP_IN with same logicalId as previously // added SWAP_IN instance. @@ -482,88 +481,71 @@ public void testNodeSwapSwapInNodeWithAlreadySwappingPair() throws Exception { addParticipant(secondInstanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapSwapInNodeWithAlreadySwappingPair") - public void testNodeSwapWrongFaultZone() throws Exception { - System.out.println("START TestInstanceOperation.testNodeSwapWrongFaultZone() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to SWAP_OUT - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error because SWAP_IN instance must be in the same FAULT_ZONE as the SWAP_OUT instance. 
- String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE) + "1", - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapWrongFaultZone") - public void testNodeSwapWrongCapacity() throws Exception { - System.out.println("START TestInstanceOperation.testNodeSwapWrongCapacity() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + InstanceConstants.InstanceOperation.SWAP_IN, -1); - // Set instance's InstanceOperation to SWAP_OUT - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); + // Instance should be UNKNOWN since there was already a swapping pair. + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, secondInstanceToSwapInName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); - // Add instance with InstanceOperation set to SWAP_IN - // There should be an error because SWAP_IN instance must have same capacity as the SWAP_OUT node. - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, TEST_CAPACITY_VALUE - 10); + // Try to set the InstanceOperation to SWAP_IN, it should throw an exception since there is already + // a swapping pair. + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, secondInstanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); } - @Test(dependsOnMethods = "testNodeSwapWrongCapacity") + @Test(dependsOnMethods = "testNodeSwapSwapInNodeWithAlreadySwappingPair") public void testNodeSwap() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwap() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - // Store original EV - Map originalEVs = getEVs(); + removeOfflineOrInactiveInstances(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + String resourceToDisablePartition = _allDBs.iterator().next(); + // Disable 1 partition that is assigned to the instance that will be swapped out. 
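The stream that follows picks one partition hosted on the swap-out instance and flips its per-partition enable flag. Outside this test harness the same knob is reachable directly through the admin API; a minimal sketch, assuming a reachable ZooKeeper and existing cluster, instance, resource, and partition names (all placeholders):

import org.apache.helix.HelixAdmin;
import org.apache.helix.manager.zk.ZKHelixAdmin;
import org.apache.helix.model.InstanceConfig;

public class DisablePartitionSketch {
  public static void main(String[] args) {
    HelixAdmin admin = new ZKHelixAdmin("localhost:2199"); // placeholder ZK address
    InstanceConfig config = admin.getInstanceConfig("TestCluster", "localhost_12918");
    // Recorded under the HELIX_DISABLED_PARTITION map field of the InstanceConfig, which is the
    // field the swap-completion assertion below expects to be carried over to the SWAP_IN node.
    config.setInstanceEnabledForPartition("TEST_DB_0", "TEST_DB_0_3", false);
    admin.setInstanceConfig("TestCluster", "localhost_12918", config);
    admin.close();
  }
}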
+ getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapOutName).entrySet().stream() + .filter(entry -> entry.getKey().startsWith(resourceToDisablePartition)).findFirst() + .ifPresent(entry -> { + String partition = entry.getKey(); + instanceToSwapOutInstanceConfig.setInstanceEnabledForPartition(resourceToDisablePartition, + partition, false); + }); + _gSetupTool.getClusterManagementTool() + .setInstanceConfig(CLUSTER_NAME, instanceToSwapOutName, instanceToSwapOutInstanceConfig); + + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + + // Store original EV + Map originalEVs = getEVs(); + validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); + // Create a custom change listener to check if the throttles are enabled after the swap is completed. CustomIndividualInstanceConfigChangeListener instanceToSwapInInstanceConfigListener = new CustomIndividualInstanceConfigChangeListener(); + // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1, instanceToSwapInInstanceConfigListener); + InstanceConstants.InstanceOperation.SWAP_IN, -1, instanceToSwapInInstanceConfigListener); + // Validate that the throttles are off since the InstanceOperation is set to SWAP_IN Assert.assertFalse(instanceToSwapInInstanceConfigListener.isThrottlesEnabled()); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, @@ -573,7 +555,7 @@ public void testNodeSwap() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -581,20 +563,32 @@ public void testNodeSwap() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .completeSwapIfPossible(CLUSTER_NAME, instanceToSwapOutName, false)); + // Get the SWAP_IN instanceConfig and make sure the disabled partitions were copied over from the swap out instance. + InstanceConfig instanceToSwapInInstanceConfig = _gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapInName); + + Assert.assertEquals(instanceToSwapInInstanceConfig.getRecord() + .getMapField(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_PARTITION.name()), + instanceToSwapOutInstanceConfig.getRecord() + .getMapField(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_PARTITION.name())); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); // Validate that the SWAP_IN instance is now in the routing tables.
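For reference, the complete-swap handshake exercised above (wait for the bootstrap, then promote) reduces to two admin calls. A minimal standalone sketch with placeholder names and ZooKeeper address:

import org.apache.helix.HelixAdmin;
import org.apache.helix.manager.zk.ZKHelixAdmin;

public class CompleteSwapSketch {
  public static void main(String[] args) {
    HelixAdmin admin = new ZKHelixAdmin("localhost:2199"); // placeholder ZK address
    String cluster = "TestCluster";
    String swapOutInstance = "localhost_12918"; // placeholder: the node being replaced
    // True once every replica has been bootstrapped on the SWAP_IN node.
    if (admin.canCompleteSwap(cluster, swapOutInstance)) {
      // Promotes the SWAP_IN node; the old node ends up UNKNOWN and out of the routing tables.
      // The boolean argument mirrors the call used in the test above.
      admin.completeSwapIfPossible(cluster, swapOutInstance, false);
    }
    admin.close();
  }
}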
validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is not active and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); // Check to make sure the throttle was enabled again after the swap was completed. Assert.assertTrue(instanceToSwapInInstanceConfigListener.isThrottlesEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -604,21 +598,18 @@ public void testNodeSwap() throws Exception { public void testNodeSwapDisableAndReenable() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwap() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed since setting the InstanceOperation to swap out Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -628,11 +619,9 @@ public void testNodeSwapDisableAndReenable() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. 
Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, @@ -642,12 +631,12 @@ public void testNodeSwapDisableAndReenable() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Disable the SWAP_IN instance + // Try to disable the swap out instance, it should not do anything. _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapInName, false); + .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); // Check that the SWAP_IN instance's replicas match the SWAP_OUT instance's replicas - // but all of them are OFFLINE + // and all of them are OFFLINE. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); Map> resourcePartitionStateOnSwapOutInstance = getResourcePartitionStateOnInstance(getEVs(), instanceToSwapOutName); @@ -658,14 +647,27 @@ public void testNodeSwapDisableAndReenable() throws Exception { .collect(Collectors.toSet()), resourcePartitionStateOnSwapOutInstance.values().stream().flatMap(p -> p.keySet().stream()) .collect(Collectors.toSet())); + Set swapOutInstancePartitionStates = + resourcePartitionStateOnSwapOutInstance.values().stream().flatMap(e -> e.values().stream()) + .collect(Collectors.toSet()); + Assert.assertEquals(swapOutInstancePartitionStates.size(), 1); + Assert.assertTrue(swapOutInstancePartitionStates.contains("OFFLINE")); Set swapInInstancePartitionStates = resourcePartitionStateOnSwapInInstance.values().stream().flatMap(e -> e.values().stream()) .collect(Collectors.toSet()); Assert.assertEquals(swapInInstancePartitionStates.size(), 1); Assert.assertTrue(swapInInstancePartitionStates.contains("OFFLINE")); - // Re-enable the SWAP_IN instance - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); + // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); + validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); + + // Assert canSwapBeCompleted is true + Assert.assertTrue(_gSetupTool.getClusterManagementTool() + .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); + + // Re-enable the swap out instance + _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapOutName, true); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. @@ -681,63 +683,62 @@ public void testNodeSwapDisableAndReenable() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is not active and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); + Assert.assertEquals(_gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. 
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); } @Test(dependsOnMethods = "testNodeSwapDisableAndReenable") - public void testNodeSwapSwapInNodeNoInstanceOperationDisabled() throws Exception { - System.out.println( - "START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperationDisabled() at " + public void testNodeSwapSwapInNodeNoInstanceOperation() throws Exception { + System.out.println("START TestInstanceOperation.testNodeSwapSwapInNodeNoInstanceOperation() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Add instance with InstanceOperation unset, should automatically be set to SWAP_IN + // Add instance with InstanceOperation unset, should set to UNKNOWN. String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, -1); + // Validate that the SWAP_IN instance does not have any partitions on it. Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Enable the SWAP_IN instance, so it can start being assigned replicas - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); + // Set InstanceOperation to SWAP_IN + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.SWAP_IN); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. 
validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -750,37 +751,34 @@ public void testNodeSwapSwapInNodeNoInstanceOperationDisabled() throws Exception Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is inactive and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); } - @Test(dependsOnMethods = "testNodeSwapSwapInNodeNoInstanceOperationDisabled") + @Test(dependsOnMethods = "testNodeSwapSwapInNodeNoInstanceOperation") public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { System.out.println( "START TestInstanceOperation.testNodeSwapCancelSwapWhenReadyToComplete() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed since setting the InstanceOperation to swap out Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -790,17 +788,15 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); - // Validate that partitions on SWAP_OUT instance does not change after setting the InstanceOperation to SWAP_OUT - // and adding the SWAP_IN instance to the cluster. - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. 
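These routing-table validations, here and throughout the class, rest on the rule that some operations are never exposed to clients. A hypothetical filter expressing it, assuming the exclusion set is exactly InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS:

import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.constants.InstanceConstants.InstanceOperation;

public class RoutableSketch {
  // Illustrative only: SWAP_IN and UNKNOWN instances are filtered out of the RoutingTableProvider.
  static boolean isRoutable(InstanceOperation operation) {
    return !InstanceConstants.UNROUTABLE_INSTANCE_OPERATIONS.contains(operation);
  }
}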
validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -808,32 +804,38 @@ public void testNodeSwapCancelSwapWhenReadyToComplete() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Cancel SWAP by disabling the SWAP_IN instance and remove SWAP_OUT InstanceOperation from SWAP_OUT instance. - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapInName, false); + // Cancel the swap by setting the InstanceOperation to UNKNOWN on the SWAP_IN instance. + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, + InstanceConstants.InstanceOperation.UNKNOWN); + + // Validate there are no partitions on the SWAP_IN instance. + Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); + validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, + Collections.emptySet(), Collections.emptySet()); + // Stop the participant _participants.get(_participants.size() - 1).syncStop(); // Wait for cluster to converge. Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); // Validate there are no partitions on the SWAP_IN instance. Assert.assertEquals(getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapInName).size(), 0); - // Validate that the SWAP_OUT instance has the same partitions as it had before. + // Validate that the swap out instance has the same partitions as it had before. validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); + .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, InstanceConstants.InstanceOperation.ENABLE); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance has the same partitions as it had before. + // Validate that the swap out instance has the same partitions as it had before. 
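Cancelling an in-flight swap, as this test just did, is two setInstanceOperation calls; a minimal sketch with placeholder names, shown before the verifier below re-checks the assignment:

import org.apache.helix.HelixAdmin;
import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.manager.zk.ZKHelixAdmin;

public class CancelSwapSketch {
  public static void main(String[] args) {
    HelixAdmin admin = new ZKHelixAdmin("localhost:2199"); // placeholder ZK address
    String cluster = "TestCluster";
    // Drop the bootstrapped replicas from the incoming node and take it out of consideration.
    admin.setInstanceOperation(cluster, "localhost_12920", InstanceConstants.InstanceOperation.UNKNOWN);
    // Keep, or put back, the original node in normal rotation.
    admin.setInstanceOperation(cluster, "localhost_12918", InstanceConstants.InstanceOperation.ENABLE);
    admin.close();
  }
}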
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet())), TIMEOUT); } @@ -843,7 +845,7 @@ public void testNodeSwapAfterEMM() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapAfterEMM() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); @@ -854,14 +856,11 @@ public void testNodeSwapAfterEMM() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, true, null, null); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - // Validate that the assignment has not changed since setting the InstanceOperation to SWAP_OUT + // Validate that the assignment has not changed. Assert.assertTrue(_clusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -871,10 +870,10 @@ public void testNodeSwapAfterEMM() throws Exception { swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Validate that the assignment has not changed since adding the SWAP_IN node. - // During MM, the cluster should not compute new assignment. + // During MM, the cluster should not compute new assignment on SWAP_IN node. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); @@ -884,14 +883,14 @@ public void testNodeSwapAfterEMM() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, false, null, null); - // Validate that partitions on SWAP_OUT instance does not change after exiting MM - // Check that the SWAP_IN instance has the same partitions as the SWAP_OUT instance + // Validate that partitions on swap out instance does not change after exiting MM + // Check that the SWAP_IN instance has the same partitions as the swap out instance // but none of them are in a top state. Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, ImmutableSet.of(instanceToSwapInName), Collections.emptySet()); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapOutName, true); validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); @@ -907,11 +906,11 @@ public void testNodeSwapAfterEMM() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. 
validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is disabled and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -923,28 +922,25 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { "START TestInstanceOperation.testNodeSwapWithSwapOutInstanceDisabled() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EVs Map originalEVs = getEVs(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); Set swapOutInstanceOriginalPartitions = getPartitionsAndStatesOnInstance(originalEVs, instanceToSwapOutName).keySet(); - // Disable the SWAP_OUT instance. + // Disable the swap out instance. _gSetupTool.getClusterManagementTool() .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); Assert.assertTrue(_clusterVerifier.verifyByPolling()); - // Validate that the SWAP_OUT instance has all partitions in OFFLINE state + // Validate that the swap out instance has all partitions in OFFLINE state Set swapOutInstanceOfflineStates = new HashSet<>(getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapOutName).values()); Assert.assertEquals(swapOutInstanceOfflineStates.size(), 1); @@ -954,21 +950,16 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Validate that the SWAP_IN instance has the same partitions in secondTopState as the SWAP_OUT instance - // did before being disabled. + // Validate that the SWAP_IN instance has no partitions because the swap started when the swap out node was offline Map swapInInstancePartitionsAndStates = getPartitionsAndStatesOnInstance(getEVs(), instanceToSwapInName); - Assert.assertTrue( - swapInInstancePartitionsAndStates.keySet().containsAll(swapOutInstanceOriginalPartitions)); - Set swapInInstanceStates = new HashSet<>(swapInInstancePartitionsAndStates.values()); - swapInInstanceStates.removeAll(SECONDARY_STATE_SET); - Assert.assertEquals(swapInInstanceStates.size(), 0); + Assert.assertEquals(swapInInstancePartitionsAndStates.size(), 0); - // Assert canSwapBeCompleted is false because SWAP_OUT instance is disabled. 
+ // Assert that canCompleteSwap returns true even though the swap out instance is disabled. Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); @@ -976,9 +967,9 @@ public void testNodeSwapWithSwapOutInstanceDisabled() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .completeSwapIfPossible(CLUSTER_NAME, instanceToSwapOutName, false)); - Assert.assertTrue(_clusterVerifier.verifyByPolling()); + Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is disabled and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); @@ -993,28 +984,23 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { "START TestInstanceOperation.testNodeSwapWithSwapOutInstanceOffline() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); Map swapOutInstancesToSwapInInstances = new HashMap<>(); - // Set instance's InstanceOperation to SWAP_OUT String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.SWAP_OUT); - - Assert.assertTrue(_clusterVerifier.verifyByPolling()); // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), - InstanceConstants.InstanceOperation.SWAP_IN, true, -1); + InstanceConstants.InstanceOperation.SWAP_IN, -1); // Kill the participant _participants.get(0).syncStop(); @@ -1025,7 +1011,7 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .canCompleteSwap(CLUSTER_NAME, instanceToSwapOutName)); - // Validate that the SWAP_OUT instance is in routing tables and SWAP_IN is not. + // Validate that the swap out instance is in routing tables and SWAP_IN is not. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, false); // Assert completeSwapIfPossible is true @@ -1037,11 +1023,11 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { // Validate that the SWAP_IN instance is now in the routing tables. validateRoutingTablesInstance(getEVs(), instanceToSwapInName, true); - // Assert that SWAP_OUT instance is disabled and has no partitions assigned to it. + // Assert that swap out instance is inactive and has no partitions assigned to it. Assert.assertFalse(_gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName).getInstanceEnabled()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had before + // Validate that the SWAP_IN instance has the same partitions the swap out instance had before // swap was completed.
verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -1051,7 +1037,7 @@ public void testNodeSwapWithSwapOutInstanceOffline() throws Exception { public void testSwapEvacuateAdd() throws Exception { System.out.println("START TestInstanceOperation.testSwapEvacuateAdd() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); @@ -1062,6 +1048,8 @@ public void testSwapEvacuateAdd() throws Exception { _gSetupTool.getClusterManagementTool() .manuallyEnableMaintenanceMode(CLUSTER_NAME, true, null, null); + Assert.assertTrue(_clusterVerifier.verifyByPolling()); + // Set instance's InstanceOperation to EVACUATE String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() @@ -1073,11 +1061,12 @@ public void testSwapEvacuateAdd() throws Exception { validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), Collections.emptySet()); - // Add instance with InstanceOperation set to SWAP_IN + // Add instance with InstanceOperation set to ENABLE String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; swapOutInstancesToSwapInInstances.put(instanceToSwapOutName, instanceToSwapInName); addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.ENABLE, -1); // Exit maintenance mode _gSetupTool.getClusterManagementTool() @@ -1085,7 +1074,7 @@ public void testSwapEvacuateAdd() throws Exception { Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Validate that the SWAP_IN instance has the same partitions the SWAP_OUT instance had. + // Validate that the SWAP_IN instance has the same partitions the swap out instance had. verifier(() -> (validateEVsCorrect(getEVs(), originalEVs, swapOutInstancesToSwapInInstances, Collections.emptySet(), ImmutableSet.of(instanceToSwapInName))), TIMEOUT); @@ -1093,9 +1082,9 @@ public void testSwapEvacuateAdd() throws Exception { Assert.assertTrue(_gSetupTool.getClusterManagementTool() .isEvacuateFinished(CLUSTER_NAME, instanceToSwapOutName)); - // Disable the EVACUATE instance - _gSetupTool.getClusterManagementTool() - .enableInstance(CLUSTER_NAME, instanceToSwapOutName, false); + // Set the EVACUATE instance to UNKNOWN + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, + InstanceConstants.InstanceOperation.UNKNOWN); Assert.assertTrue(_clusterVerifier.verifyByPolling()); @@ -1105,31 +1094,13 @@ public void testSwapEvacuateAdd() throws Exception { } @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testSwapEvacuateAdd") - public void testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet() throws Exception { + public void testUnsetInstanceOperationOnSwapInWhenSwapping() throws Exception { System.out.println( - "START TestInstanceOperation.testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet() at " + "START TestInstanceOperation.testUnsetInstanceOperationOnSwapInWhenSwapping() at " + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - // Get the SWAP_OUT instance. 
- String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - - // Add instance with InstanceOperation set to SWAP_IN enabled before setting SWAP_OUT instance. - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); - } + removeOfflineOrInactiveInstances(); - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapAddSwapInFirstEnabledBeforeSwapOutSet") - public void testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet() throws Exception { - System.out.println( - "START TestInstanceOperation.testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet() at " - + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Get the SWAP_OUT instance. String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); @@ -1137,49 +1108,27 @@ public void testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet() throws Exception // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); - - Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - - // Enable the SWAP_IN instance before we have set the SWAP_OUT instance. - _gSetupTool.getClusterManagementTool().enableInstance(CLUSTER_NAME, instanceToSwapInName, true); - } - - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testNodeSwapAddSwapInFirstEnableBeforeSwapOutSet") - public void testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut() throws Exception { - System.out.println( - "START TestInstanceOperation.testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut() at " - + new Date(System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Get the SWAP_OUT instance. - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.SWAP_IN, -1); Assert.assertTrue(_bestPossibleClusterVerifier.verifyByPolling()); - // Try to remove the InstanceOperation from the SWAP_IN instance before the SWAP_OUT instance is set. + // Try to remove the InstanceOperation from the SWAP_IN instance before swap in instance is set to unknown. // This should throw exception because we cannot ever have two instances with the same logicalId and both have InstanceOperation // unset. 
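The invariant behind that expected exception can be read off the constants: two instances sharing a logicalId must not both carry an assignable operation. A hypothetical check (not Helix API), assuming the assignable set is InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS:

import org.apache.helix.constants.InstanceConstants;
import org.apache.helix.constants.InstanceConstants.InstanceOperation;

public class LogicalIdInvariantSketch {
  // Illustrative only: true when the proposed operation would leave two instances with the same
  // logicalId both assignable (ENABLE or DISABLE), which is what setInstanceOperation rejects below.
  static boolean violatesLogicalIdInvariant(InstanceOperation existingPeer, InstanceOperation proposed) {
    return InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains(existingPeer)
        && InstanceConstants.ASSIGNABLE_INSTANCE_OPERATIONS.contains(proposed);
  }
}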
_gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, null); + .setInstanceOperation(CLUSTER_NAME, instanceToSwapInName, InstanceConstants.InstanceOperation.ENABLE); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testUnsetInstanceOperationOnSwapInWhenAlreadyUnsetOnSwapOut") + @Test(dependsOnMethods = "testUnsetInstanceOperationOnSwapInWhenSwapping") public void testNodeSwapAddSwapInFirst() throws Exception { System.out.println("START TestInstanceOperation.testNodeSwapAddSwapInFirst() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); // Store original EV Map originalEVs = getEVs(); - // Get the SWAP_OUT instance. + // Get the swap out instance. String instanceToSwapOutName = _participants.get(0).getInstanceName(); InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); @@ -1187,7 +1136,8 @@ public void testNodeSwapAddSwapInFirst() throws Exception { // Add instance with InstanceOperation set to SWAP_IN String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, false, -1); + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.SWAP_IN, -1); } @Test(dependsOnMethods = "testNodeSwapAddSwapInFirst") @@ -1195,7 +1145,8 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { System.out.println( "START TestInstanceOperation.testEvacuateAndCancelBeforeBootstrapFinish() at " + new Date( System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); + + removeOfflineOrInactiveInstances(); // add a resource where downward state transition is slow createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", @@ -1236,7 +1187,7 @@ public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { // cancel the evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); assignment = getEVs(); for (String resource : _allDBs) { @@ -1278,7 +1229,7 @@ public void testEvacuateAndCancelBeforeDropFinish() throws Exception { // cancel evacuation _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, null); + .setInstanceOperation(CLUSTER_NAME, instanceToEvacuate, InstanceConstants.InstanceOperation.ENABLE); // check every replica has >= 3 active replicas, even before cluster converge Map assignment = getEVs(); for (String resource : _allDBs) { @@ -1346,7 +1297,31 @@ public void testMarkEvacuationAfterEMM() throws Exception { _stateModelDelay = 3L; } - @Test(dependsOnMethods = "testMarkEvacuationAfterEMM") + @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testMarkEvacuationAfterEMM") + public void testSwapEvacuateAddRemoveEvacuate() throws Exception { + System.out.println("START TestInstanceOperation.testSwapEvacuateAddRemoveEvacuate() at " + new Date( + System.currentTimeMillis())); + removeOfflineOrInactiveInstances(); + + // Set instance's InstanceOperation to EVACUATE + String instanceToSwapOutName = _participants.get(0).getInstanceName(); + InstanceConfig 
instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() + .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); + _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, + InstanceConstants.InstanceOperation.EVACUATE); + + // Add instance with InstanceOperation set to ENABLE + String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + _nextStartPort; + addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), + instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), + InstanceConstants.InstanceOperation.ENABLE, -1); + + // Remove EVACUATE instance's InstanceOperation + _gSetupTool.getClusterManagementTool() + .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, InstanceConstants.InstanceOperation.ENABLE); + } + + @Test(dependsOnMethods = "testSwapEvacuateAddRemoveEvacuate") public void testEvacuationWithOfflineInstancesInCluster() throws Exception { System.out.println( "START TestInstanceOperation.testEvacuationWithOfflineInstancesInCluster() at " + new Date( @@ -1358,11 +1333,10 @@ public void testEvacuationWithOfflineInstancesInCluster() throws Exception { _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, evacuateInstanceName, InstanceConstants.InstanceOperation.EVACUATE); - Map assignment; // EV should contain all participants, check resources one by one - assignment = getEVs(); - for (String resource : _allDBs) { - verifier(() -> { + verifier(() -> { + Map assignment = getEVs(); + for (String resource : _allDBs) { ExternalView ev = assignment.get(resource); for (String partition : ev.getPartitionSet()) { AtomicInteger activeReplicaCount = new AtomicInteger(); @@ -1372,44 +1346,21 @@ public void testEvacuationWithOfflineInstancesInCluster() throws Exception { .forEach(v -> activeReplicaCount.getAndIncrement()); if (activeReplicaCount.get() < REPLICA - 1 || ( ev.getStateMap(partition).containsKey(evacuateInstanceName) && ev.getStateMap( - partition).get(evacuateInstanceName).equals("MASTER") && ev.getStateMap(partition) - .get(evacuateInstanceName).equals("LEADER"))) { + partition).get(evacuateInstanceName).equals("MASTER") && ev.getStateMap( + partition).get(evacuateInstanceName).equals("LEADER"))) { return false; } } - return true; - }, 30000); - } + } + return true; + }, 30000); - removeOfflineOrDisabledOrSwapInInstances(); + removeOfflineOrInactiveInstances(); addParticipant(PARTICIPANT_PREFIX + "_" + _nextStartPort); addParticipant(PARTICIPANT_PREFIX + "_" + _nextStartPort); dropTestDBs(ImmutableSet.of("TEST_DB3_DELAYED_CRUSHED", "TEST_DB4_DELAYED_WAGED")); } - @Test(expectedExceptions = HelixException.class, dependsOnMethods = "testEvacuationWithOfflineInstancesInCluster") - public void testSwapEvacuateAddRemoveEvacuate() throws Exception { - System.out.println("START TestInstanceOperation.testSwapEvacuateAddRemoveEvacuate() at " + new Date( - System.currentTimeMillis())); - removeOfflineOrDisabledOrSwapInInstances(); - - // Set instance's InstanceOperation to EVACUATE - String instanceToSwapOutName = _participants.get(0).getInstanceName(); - InstanceConfig instanceToSwapOutInstanceConfig = _gSetupTool.getClusterManagementTool() - .getInstanceConfig(CLUSTER_NAME, instanceToSwapOutName); - _gSetupTool.getClusterManagementTool().setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, - InstanceConstants.InstanceOperation.EVACUATE); - - // Add instance with InstanceOperation set to SWAP_IN - String instanceToSwapInName = PARTICIPANT_PREFIX + "_" + 
_nextStartPort; - addParticipant(instanceToSwapInName, instanceToSwapOutInstanceConfig.getLogicalId(LOGICAL_ID), - instanceToSwapOutInstanceConfig.getDomainAsMap().get(ZONE), null, true, -1); - - // Remove EVACUATE instance's InstanceOperation - _gSetupTool.getClusterManagementTool() - .setInstanceOperation(CLUSTER_NAME, instanceToSwapOutName, null); - } - /** * Verifies that the given verifier returns true within the given timeout. Handles AssertionError * by returning false, which TestHelper.verify will not do. Asserts that return value from @@ -1448,10 +1399,10 @@ public boolean isThrottlesEnabled() { @Override public void onInstanceConfigChange(List instanceConfig, NotificationContext context) { - if (instanceConfig.get(0).getInstanceOperation() - .equals(InstanceConstants.InstanceOperation.SWAP_IN.name())) { + if (instanceConfig.get(0).getInstanceOperation().getOperation() + .equals(InstanceConstants.InstanceOperation.SWAP_IN)) { throttlesEnabled = false; - } else if (instanceConfig.get(0).getInstanceOperation().isEmpty()) { + } else { throttlesEnabled = true; } } @@ -1470,21 +1421,21 @@ private MockParticipantManager createParticipant(String participantName) throws private void addParticipant(String participantName) throws Exception { addParticipant(participantName, UUID.randomUUID().toString(), - "zone_" + _participants.size() % ZONE_COUNT, null, true, -1); + "zone_" + _participants.size() % ZONE_COUNT, null, -1); } private void addParticipant(String participantName, String logicalId, String zone, - InstanceConstants.InstanceOperation instanceOperation, boolean enabled, int capacity) + InstanceConstants.InstanceOperation instanceOperation, int capacity) throws Exception { - addParticipant(participantName, logicalId, zone, instanceOperation, enabled, capacity, null); + addParticipant(participantName, logicalId, zone, instanceOperation, capacity, null); } private void addParticipant(String participantName, String logicalId, String zone, - InstanceConstants.InstanceOperation instanceOperation, boolean enabled, int capacity, + InstanceConstants.InstanceOperation instanceOperation, int capacity, InstanceConfigChangeListener listener) throws Exception { InstanceConfig config = new InstanceConfig.Builder().setDomain( String.format("%s=%s, %s=%s, %s=%s", ZONE, zone, HOST, participantName, LOGICAL_ID, - logicalId)).setInstanceEnabled(enabled).setInstanceOperation(instanceOperation) + logicalId)).setInstanceOperation(instanceOperation) .build(participantName); if (capacity >= 0) { diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java index 697847c29c..9cc4eea52f 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedExpandCluster.java @@ -22,6 +22,7 @@ import java.util.HashMap; import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.integration.rebalancer.PartitionMigration.TestPartitionMigrationBase; import org.apache.helix.model.ClusterConfig; @@ -103,7 +104,7 @@ public void testClusterExpansionByEnableInstance() for (int i = numNodes; i < numNodes + NUM_NODE; i++) { String storageNodeName = PARTICIPANT_PREFIX + "_" + (START_PORT + i); 
InstanceConfig config = InstanceConfig.toInstanceConfig(storageNodeName); - config.setInstanceEnabled(false); + config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); config.getRecord().getSimpleFields() .remove(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED_TIMESTAMP.name()); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java index f6ef8279dc..fbb7304509 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedNodeSwap.java @@ -32,6 +32,7 @@ import org.apache.helix.ConfigAccessor; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.integration.manager.ClusterControllerManager; import org.apache.helix.integration.manager.MockParticipantManager; import org.apache.helix.model.BuiltInStateModelDefinitions; @@ -156,7 +157,7 @@ public void testNodeSwap() throws Exception { String oldParticipantName = oldParticipant.getInstanceName(); final InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, oldParticipantName); - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, oldParticipantName, instanceConfig); Assert.assertTrue(_clusterVerifier.verify(10000)); @@ -231,7 +232,7 @@ public void testFaultZoneSwap() throws Exception { InstanceConfig instanceConfig = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceName); if (instanceConfig.getDomainAsMap().get("zone").equals(randZoneStr)) { - instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, instanceName, instanceConfig); removedInstanceConfigMap.put(instanceName, instanceConfig); diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java index a7250d4804..26eb13d7a6 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/WagedRebalancer/TestWagedRebalance.java @@ -34,6 +34,7 @@ import org.apache.helix.HelixException; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy; import org.apache.helix.controller.rebalancer.strategy.CrushRebalanceStrategy; import org.apache.helix.controller.rebalancer.util.RebalanceScheduler; @@ -387,7 +388,7 @@ public void testDisableInstance() throws InterruptedException { disableParticipants.add(p.getInstanceName()); InstanceConfig config = _gSetupTool.getClusterManagementTool() .getInstanceConfig(CLUSTER_NAME, p.getInstanceName()); - config.setInstanceEnabled(false); + config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); 
_gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, p.getInstanceName(), config); } @@ -408,7 +409,7 @@ public void testDisableInstance() throws InterruptedException { for (String instanceName : disableParticipants) { InstanceConfig config = _gSetupTool.getClusterManagementTool().getInstanceConfig(CLUSTER_NAME, instanceName); - config.setInstanceEnabled(true); + config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); _gSetupTool.getClusterManagementTool() .setInstanceConfig(CLUSTER_NAME, instanceName, config); } diff --git a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java index ada5157a3b..4181c4b822 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java +++ b/helix-core/src/test/java/org/apache/helix/integration/task/TestTaskSchedulingTwoCurrentStates.java @@ -96,7 +96,7 @@ public void afterClass() throws Exception { super.afterClass(); } - @Test + @Test (enabled = false) public void testTargetedTaskTwoCurrentStates() throws Exception { _gSetupTool.addResourceToCluster(CLUSTER_NAME, DATABASE, _numPartitions, MASTER_SLAVE_STATE_MODEL, IdealState.RebalanceMode.SEMI_AUTO.name()); diff --git a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java index 59decd98e5..b54be8c045 100644 --- a/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/manager/zk/TestZkHelixAdmin.java @@ -182,8 +182,7 @@ public void testZkHelixAdmin() { String disableReason = "Reason"; tool.enableInstance(clusterName, instanceName, false, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, disableReason); - Assert.assertTrue(tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason() - .equals(disableReason)); + Assert.assertEquals(disableReason, tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason()); tool.enableInstance(clusterName, instanceName, true, InstanceConstants.InstanceDisabledType.CLOUD_EVENT, disableReason); Assert.assertTrue( @@ -348,6 +347,65 @@ public void testZkHelixAdmin() { System.out.println("END testZkHelixAdmin at " + new Date(System.currentTimeMillis())); } + @Test + private void testSetInstanceOperation() { + System.out.println("START testSetInstanceOperation at " + new Date(System.currentTimeMillis())); + + final String clusterName = getShortClassName(); + String rootPath = "/" + clusterName; + if (_gZkClient.exists(rootPath)) { + _gZkClient.deleteRecursively(rootPath); + } + + HelixAdmin tool = new ZKHelixAdmin(_gZkClient); + tool.addCluster(clusterName, true); + Assert.assertTrue(ZKUtil.isClusterSetup(clusterName, _gZkClient)); + Assert.assertTrue(_gZkClient.exists(PropertyPathBuilder.customizedStateConfig(clusterName))); + + // Add instance to cluster + String hostname = "host1"; + String port = "9999"; + String instanceName = hostname + "_" + port; + InstanceConfig config = + new InstanceConfig.Builder().setHostName(hostname).setPort(port).build(instanceName); + + tool.addInstance(clusterName, config); + + // Set instance operation to DISABLE + tool.setInstanceOperation(clusterName, instanceName, + InstanceConstants.InstanceOperation.DISABLE, "disableReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, 
instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.DISABLE); + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason(), + "disableReason"); + + // Set instance operation to ENABLE + tool.setInstanceOperation(clusterName, instanceName, InstanceConstants.InstanceOperation.ENABLE, + "enableReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + // InstanceNonServingReason should be empty after setting operation to ENABLE + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceDisabledReason(), ""); + + // Set instance operation to UNKNOWN + tool.setInstanceOperation(clusterName, instanceName, + InstanceConstants.InstanceOperation.UNKNOWN, "unknownReason"); + Assert.assertEquals(tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation() + .getOperation(), + InstanceConstants.InstanceOperation.UNKNOWN); + Assert.assertEquals( + tool.getInstanceConfig(clusterName, instanceName).getInstanceOperation().getReason(), + "unknownReason"); + + deleteCluster(clusterName); + + System.out.println("END testSetInstanceOperation at " + new Date(System.currentTimeMillis())); + } + private HelixManager initializeHelixManager(String clusterName, String instanceName) { HelixManager manager = HelixManagerFactory.getZKHelixManager(clusterName, instanceName, InstanceType.PARTICIPANT, org.apache.helix.common.ZkTestBase.ZK_ADDR); @@ -589,6 +647,117 @@ public void testLegacyEnableDisablePartition() { 2); } + @Test(description = "Unit test for sanity check in setPartitionsToError()") + public void testSetPartitionsToError() throws Exception { + String className = TestHelper.getTestClassName(); + String methodName = TestHelper.getTestMethodName(); + String clusterName = className + "_" + methodName; + String instanceName = "TestInstance"; + String testResource = "TestResource"; + String wrongTestInstance = "WrongTestInstance"; + String wrongTestResource = "WrongTestResource"; + System.out.println("START " + clusterName + " at " + new Date(System.currentTimeMillis())); + HelixAdmin admin = new ZKHelixAdmin(_gZkClient); + admin.addCluster(clusterName, true); + admin.addInstance(clusterName, new InstanceConfig(instanceName)); + admin.enableInstance(clusterName, instanceName, true); + InstanceConfig instanceConfig = admin.getInstanceConfig(clusterName, instanceName); + + IdealState idealState = new IdealState(testResource); + idealState.setNumPartitions(3); + admin.addStateModelDef(clusterName, "MasterSlave", new MasterSlaveSMD()); + idealState.setStateModelDefRef("MasterSlave"); + idealState.setRebalanceMode(IdealState.RebalanceMode.FULL_AUTO); + admin.addResource(clusterName, testResource, idealState); + admin.enableResource(clusterName, testResource, true); + + /* + * This is a unit test for sanity check in setPartitionsToError(). + * There is no running controller in this test. We have end-to-end tests for + * setPartitionsToError() + * under integration/TestSetPartitionsToError. + */ + // setPartitionsToError is expected to throw an exception when provided with a nonexistent + // instance. + try { + admin.setPartitionsToError(clusterName, wrongTestInstance, testResource, + Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the instance name is made up. 
+ Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", + testResource, wrongTestInstance, clusterName)); + } + + // setPartitionsToError is expected to throw an exception when provided with a non-live + // instance. + try { + admin.setPartitionsToError(clusterName, instanceName, testResource, Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the instance is not alive. + Assert.assertEquals(expected.getMessage(), + String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because %s is not alive in cluster %s", + testResource, instanceName, instanceName, clusterName)); + } + + HelixManager manager = initializeHelixManager(clusterName, instanceConfig.getInstanceName()); + manager.connect(); + + // setPartitionsToError is expected to throw an exception when provided with a nonexistent + // resource. + try { + admin.setPartitionsToError(clusterName, instanceName, wrongTestResource, + Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because the resource is not added. + Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", + wrongTestResource, instanceName, wrongTestResource, clusterName)); + } + + // setPartitionsToError is expected to throw an exception when partition does not exist. + try { + admin.setPartitionsToError(clusterName, instanceName, testResource, Arrays.asList("1", "2")); + Assert.fail("Should throw HelixException"); + } catch (HelixException expected) { + // This exception is expected because partitions do not exist. + Assert.assertEquals(expected.getMessage(), String.format( + "Can't SET_TO_ERROR state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", + testResource, instanceName, clusterName)); + } + + // clean up + manager.disconnect(); + admin.dropCluster(clusterName); + + // verify the cluster has been removed successfully + HelixDataAccessor dataAccessor = + new ZKHelixDataAccessor(className, new ZkBaseDataAccessor<>(_gZkClient)); + try { + Assert.assertTrue(TestHelper.verify( + () -> dataAccessor.getChildNames(dataAccessor.keyBuilder().liveInstances()).isEmpty(), + 1000)); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("There're live instances not cleaned up yet"); + assert false; + } + + try { + Assert.assertTrue(TestHelper.verify( + () -> dataAccessor.getChildNames(dataAccessor.keyBuilder().clusterConfig()).isEmpty(), + 1000)); + } catch (Exception e) { + e.printStackTrace(); + System.out.println("The cluster is not cleaned up yet"); + assert false; + } + } + @Test public void testResetPartition() throws Exception { String className = TestHelper.getTestClassName(); @@ -625,7 +794,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the instance name is made up. 
Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", + "Can't RESET state for %s.[1, 2] on WrongTestInstance, because %s does not exist in cluster %s", testResource, wrongTestInstance, clusterName)); } @@ -636,7 +805,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the instance is not alive. Assert.assertEquals(expected.getMessage(), String - .format("Can't reset state for %s.[1, 2] on %s, because %s is not alive in cluster %s", + .format("Can't RESET state for %s.[1, 2] on %s, because %s is not alive in cluster %s", testResource, instanceName, instanceName, clusterName)); } @@ -650,7 +819,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because the resource is not added. Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", + "Can't RESET state for %s.[1, 2] on %s, because resource %s is not added to cluster %s", wrongTestResource, instanceName, wrongTestResource, clusterName)); } @@ -660,7 +829,7 @@ public void testResetPartition() throws Exception { } catch (HelixException expected) { // This exception is expected because partitions do not exist. Assert.assertEquals(expected.getMessage(), String.format( - "Can't reset state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", + "Can't RESET state for %s.[1, 2] on %s, because not all [1, 2] exist in cluster %s", testResource, instanceName, clusterName)); } diff --git a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java index 40d5c9774d..77f4432352 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PMessagesAvoidDuplicatedMessage.java @@ -82,6 +82,7 @@ private void preSetup() throws Exception { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); _fullPipeline = new Pipeline("FullPipeline"); @@ -124,6 +125,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { CurrentStateOutput currentStateOutput = populateCurrentStateFromBestPossible(_bestpossibleState); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -161,6 +163,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setPendingRelayMessage(_db, _partition, initialMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -179,6 +182,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { 
currentStateOutput.setPendingRelayMessage(_db, _partition, initialMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _messagePipeline.handle(event); @@ -218,6 +222,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), _bestpossibleState); _messagePipeline.handle(event); @@ -244,6 +249,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setCurrentState(_db, _partition, initialMaster, "SLAVE"); currentStateOutput.setPendingMessage(_db, _partition, secondMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _fullPipeline.handle(event); @@ -264,6 +270,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { // Validate: controller should not send S->M to thirdMaster. currentStateOutput.setCurrentState(_db, _partition, initialMaster, "OFFLINE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); thirdMaster = getTopStateInstance(_bestpossibleState.getInstanceStateMap(_db, _partition), @@ -290,6 +297,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { // Validate: Controller should not send S->M to thirdMaster. currentStateOutput.setPendingMessage(_db, _partition, secondMaster, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), _bestpossibleState); @@ -310,6 +318,7 @@ public void testP2PAvoidDuplicatedMessage() throws Exception { currentStateOutput.setCurrentState(_db, _partition, thirdMaster, "SLAVE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); _messagePipeline.handle(event); diff --git a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java index 307022f3fb..9a38656953 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PStateTransitionMessages.java @@ -95,6 +95,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); Pipeline pipeline = createPipeline(); @@ -106,6 +107,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { CurrentStateOutput currentStateOutput = 
populateCurrentStateFromBestPossible(bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); Partition p = new Partition(db + "_0"); @@ -153,6 +155,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setPendingRelayMessage(db, p, masterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline.handle(event); @@ -167,6 +170,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setCurrentState(db, p, masterInstance, "SLAVE"); currentStateOutput.setPendingMessage(db, p, newMasterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline.handle(event); @@ -186,6 +190,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { // but controller should not send S->M to newly calculated master. currentStateOutput.setCurrentState(db, p, masterInstance, "OFFLINE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); String slaveInstance = getTopStateInstance(bestPossibleStateOutput.getInstanceStateMap(db, p), @@ -217,6 +222,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { // Controller will not send S->M to new master. currentStateOutput.setPendingMessage(db, p, newMasterInstance, relayMessage); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.BEST_POSSIBLE_STATE.name(), bestPossibleStateOutput); event.addAttribute(AttributeName.INTERMEDIATE_STATE.name(), bestPossibleStateOutput); @@ -244,6 +250,7 @@ public void testAvoidDuplicatedMessageWithP2PEnabled() throws Exception { currentStateOutput.setCurrentState(db, p, slaveInstance, "SLAVE"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); pipeline = new Pipeline("test"); pipeline.addStage(new MessageGenerationPhase()); @@ -271,6 +278,7 @@ private void testP2PMessage(ClusterConfig clusterConfig, Boolean p2pMessageEnabl event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), new CurrentStateOutput()); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), new CurrentStateOutput()); event.addAttribute(AttributeName.helixmanager.name(), manager); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); @@ -284,6 +292,7 @@ private void testP2PMessage(ClusterConfig clusterConfig, Boolean p2pMessageEnabl CurrentStateOutput currentStateOutput = populateCurrentStateFromBestPossible(bestPossibleStateOutput); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); Partition p = new 
Partition(db + "_0"); diff --git a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java index ea2a4aa38d..9f60a4a7fb 100644 --- a/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java +++ b/helix-core/src/test/java/org/apache/helix/messaging/p2pMessage/TestP2PWithStateCancellationMessage.java @@ -165,6 +165,7 @@ private ClusterEvent generateClusterEvent() { currentStateOutput.setCurrentState(RESOURCE_NAME, new Partition("0"), "localhost_2", "SLAVE"); currentStateOutput.setCurrentState(RESOURCE_NAME, new Partition("1"), "localhost_2", "MASTER"); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); BestPossibleStateOutput bestPossibleStateOutput = new BestPossibleStateOutput(); bestPossibleStateOutput.setState(RESOURCE_NAME, new Partition("0"), "localhost_1", "SLAVE"); diff --git a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java index 60ee9b6257..c5c5626ff6 100644 --- a/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java +++ b/helix-core/src/test/java/org/apache/helix/mock/MockHelixAdmin.java @@ -24,6 +24,8 @@ import java.util.List; import java.util.Map; +import javax.annotation.Nullable; + import org.apache.helix.BaseDataAccessor; import org.apache.helix.HelixAdmin; import org.apache.helix.HelixDataAccessor; @@ -266,11 +268,13 @@ public void removeFromIdealState(String clusterName, String resourceName, IdealS } + @Deprecated @Override public void enableInstance(String clusterName, String instanceName, boolean enabled) { enableInstance(clusterName, instanceName, enabled, null, null); } + @Deprecated @Override public void enableInstance(String clusterName, String instanceName, boolean enabled, InstanceConstants.InstanceDisabledType disabledType, String reason) { @@ -283,19 +287,17 @@ public void enableInstance(String clusterName, String instanceName, boolean enab ZNRecord record = (ZNRecord) _baseDataAccessor.get(instanceConfigPath, null, 0); InstanceConfig instanceConfig = new InstanceConfig(record); - instanceConfig.setInstanceEnabled(enabled); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + enabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE).setReason(reason).build()); if (!enabled) { + // TODO: Replace this when the HELIX_ENABLED and HELIX_DISABLED fields are removed. 
instanceConfig.resetInstanceDisabledTypeAndReason(); - if (reason != null) { - instanceConfig.setInstanceDisabledReason(reason); - } - if (disabledType != null) { - instanceConfig.setInstanceDisabledType(disabledType); - } } _baseDataAccessor.set(instanceConfigPath, instanceConfig.getRecord(), 0); } + @Deprecated @Override public void enableInstance(String clusterName, List instances, boolean enabled) { @@ -303,7 +305,20 @@ public void enableInstance(String clusterName, List instances, boolean e @Override public void setInstanceOperation(String clusterName, String instanceName, - InstanceConstants.InstanceOperation instanceOperation) { + @Nullable InstanceConstants.InstanceOperation instanceOperation) { + setInstanceOperation(clusterName, instanceName, instanceOperation, null, false); + } + + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason) { + setInstanceOperation(clusterName, instanceName, instanceOperation, reason, false); + } + + @Override + public void setInstanceOperation(String clusterName, String instanceName, + @Nullable InstanceConstants.InstanceOperation instanceOperation, String reason, + boolean overrideAll) { } @Override @@ -360,6 +375,12 @@ public ClusterManagementMode getClusterManagementMode(String clusterName) { return null; } + @Override + public void setPartitionsToError(String clusterName, String instanceName, String resourceName, + List partitionNames) { + + } + @Override public void resetPartition(String clusterName, String instanceName, String resourceName, List partitionNames) { diff --git a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java index b8e6569f5d..47ea88ac4d 100644 --- a/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java +++ b/helix-core/src/test/java/org/apache/helix/model/TestInstanceConfig.java @@ -51,8 +51,7 @@ public void testGetParsedDomain() { @Test public void testSetInstanceEnableWithReason() { InstanceConfig instanceConfig = new InstanceConfig(new ZNRecord("id")); - instanceConfig.setInstanceEnabled(true); - instanceConfig.setInstanceDisabledReason("NoShowReason"); + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() @@ -62,10 +61,9 @@ public void testSetInstanceEnableWithReason() { Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() .get(InstanceConfig.InstanceConfigProperty.HELIX_DISABLED_TYPE.toString()), null); - - instanceConfig.setInstanceEnabled(false); String reasonCode = "ReasonCode"; - instanceConfig.setInstanceDisabledReason(reasonCode); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason(reasonCode).build()); instanceConfig.setInstanceDisabledType(InstanceConstants.InstanceDisabledType.USER_OPERATION); Assert.assertEquals(instanceConfig.getRecord().getSimpleFields() .get(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED.toString()), "false"); @@ -197,4 +195,151 @@ public void testInstanceConfigBuilder() { Assert.assertEquals(instanceConfig.getInstanceInfoMap().get("CABINET"), "30"); Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight1"), Integer.valueOf(1)); } + + @Test + public 
void testInstanceOperationReason() { + InstanceConfig instanceConfig = new InstanceConfig("instance1"); + instanceConfig.setInstanceEnabled(false); + instanceConfig.setInstanceDisabledReason("disableReason"); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.UNKNOWN).setReason("unknownReason").build()); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason"); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), "unknownReason"); + + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("disableReason2").build()); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), "disableReason2"); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), "disableReason2"); + + instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE); + Assert.assertEquals(instanceConfig.getInstanceDisabledReason(), ""); + Assert.assertEquals(instanceConfig.getInstanceOperation().getReason(), ""); + } + + @Test + public void testOverwriteInstanceConfig() { + InstanceConfig instanceConfig = new InstanceConfig("instance2"); + instanceConfig.setHostName("host1"); + instanceConfig.setPort("1234"); + instanceConfig.setDomain("foo=bar"); + instanceConfig.setWeight(100); + instanceConfig.setInstanceEnabled(false); + instanceConfig.addTag("tag1"); + instanceConfig.addTag("tag2"); + instanceConfig.setInstanceCapacityMap(ImmutableMap.of("weight1", 1)); + + InstanceConfig overrideConfig = new InstanceConfig("instance1"); + overrideConfig.setHostName("host2"); + overrideConfig.setPort("5678"); + overrideConfig.setDomain("foo=bar2"); + overrideConfig.setWeight(200); + overrideConfig.addTag("tag3"); + overrideConfig.addTag("tag4"); + overrideConfig.setInstanceOperation(InstanceConstants.InstanceOperation.EVACUATE); + overrideConfig.setInstanceCapacityMap(ImmutableMap.of("weight2", 2)); + + instanceConfig.overwriteInstanceConfig(overrideConfig); + + Assert.assertEquals(instanceConfig.getId(), "instance2"); + Assert.assertEquals(instanceConfig.getHostName(), "host1"); + Assert.assertEquals(instanceConfig.getPort(), "1234"); + Assert.assertEquals(instanceConfig.getDomainAsString(), "foo=bar"); + Assert.assertEquals(instanceConfig.getWeight(), 200); + Assert.assertFalse(instanceConfig.getTags().contains("tag1")); + Assert.assertFalse(instanceConfig.getTags().contains("tag2")); + Assert.assertTrue(instanceConfig.getTags().contains("tag3")); + Assert.assertTrue(instanceConfig.getTags().contains("tag4")); + Assert.assertFalse(instanceConfig.getRecord().getSimpleFields() + .containsKey(InstanceConfig.InstanceConfigProperty.HELIX_ENABLED.toString())); + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.EVACUATE); + Assert.assertFalse(instanceConfig.getInstanceCapacityMap().containsKey("weight1")); + Assert.assertEquals(instanceConfig.getInstanceCapacityMap().get("weight2"), Integer.valueOf(2)); + } + + @Test + public void testInstanceOperationMultipleSources() throws InterruptedException { + InstanceConfig instanceConfig = new 
InstanceConfig("instance1"); + + // Check that the instance operation is ENABLE from the DEFAULT source + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.DEFAULT); + + // Set instance operation from user source + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("userReason") + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + // Get enabled time + long op1EnabledTime = instanceConfig.getInstanceEnabledTime(); + + Thread.sleep(1000); + // Set instance operation from automation source + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE).setReason("automationReason") + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Check that the enabled time is the same as op1 but the source and reason is changed to automation + Assert.assertEquals(instanceConfig.getInstanceEnabledTime(), op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + Thread.sleep(1000); + // Set instance operation from user source to be ENABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + + // Check that the operation is DISABLE, the enabled time is the same as op1, and the source is still automation + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.DISABLE); + Assert.assertEquals(instanceConfig.getInstanceEnabledTime(), op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + Thread.sleep(1000); + // Set the instance operation from the automation source to be ENABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.ENABLE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Check that the operation is ENABLE, the enabled time is the different from op1, and the source is still automation + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); + Assert.assertFalse(instanceConfig.getInstanceEnabledTime() == op1EnabledTime); + Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(), + InstanceConstants.InstanceOperationSource.AUTOMATION); + + // Set the instance operation from the automation source to be EVACUATE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.EVACUATE) + .setSource(InstanceConstants.InstanceOperationSource.AUTOMATION).build()); + + // Set the instance operation from the user source to be DISABLE + instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation( + InstanceConstants.InstanceOperation.DISABLE) + .setSource(InstanceConstants.InstanceOperationSource.USER).build()); + + // Check that the instance operation is DISABLE and the source is user 
+    Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(),
+        InstanceConstants.InstanceOperation.DISABLE);
+    Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(),
+        InstanceConstants.InstanceOperationSource.USER);
+
+    // Set the instance operation from the admin source to be ENABLE
+    instanceConfig.setInstanceOperation(new InstanceConfig.InstanceOperation.Builder().setOperation(
+        InstanceConstants.InstanceOperation.ENABLE)
+        .setSource(InstanceConstants.InstanceOperationSource.ADMIN).build());
+
+    // Check that the instance operation is ENABLE and the source is admin
+    Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(),
+        InstanceConstants.InstanceOperation.ENABLE);
+    Assert.assertEquals(instanceConfig.getInstanceOperation().getSource(),
+        InstanceConstants.InstanceOperationSource.ADMIN);
+  }
 }
diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java
index 57a3ad0bc8..b02a0f41d4 100644
--- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java
+++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestRebalancerMetrics.java
@@ -69,6 +69,7 @@ public void testRecoveryRebalanceMetrics() {
 event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ClusterStatusMonitor monitor = new ClusterStatusMonitor(_clusterName); monitor.active();
@@ -119,6 +120,7 @@ public void testLoadBalanceMetrics() {
 event.addAttribute(AttributeName.RESOURCES.name(), resourceMap); event.addAttribute(AttributeName.RESOURCES_TO_REBALANCE.name(), resourceMap); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); event.addAttribute(AttributeName.ControllerDataProvider.name(), new ResourceControllerDataProvider()); ClusterStatusMonitor monitor = new ClusterStatusMonitor(_clusterName); monitor.active();
@@ -131,6 +133,7 @@ public void testLoadBalanceMetrics() {
 event.getAttribute(AttributeName.BEST_POSSIBLE_STATE.name()); currentStateOutput = copyCurrentStateFromBestPossible(bestPossibleStateOutput, resource); event.addAttribute(AttributeName.CURRENT_STATE.name(), currentStateOutput); + event.addAttribute(AttributeName.CURRENT_STATE_EXCLUDING_UNKNOWN.name(), currentStateOutput); setupLiveInstances(4);
diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java
index 0fb0e09371..355cad4501 100644
--- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java
+++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestResourceMonitor.java
@@ -37,6 +37,7 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.helix.TestHelper; +import org.apache.helix.model.ResourceConfig; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.apache.helix.model.BuiltInStateModelDefinitions; import org.apache.helix.model.ExternalView;
@@ -220,6 +221,26 @@ public 
void testReportData() throws JMException { monitor.setRebalanceState(ResourceMonitor.RebalanceStatus.INTERMEDIATE_STATE_CAL_FAILED); Assert.assertEquals(monitor.getRebalanceState(), ResourceMonitor.RebalanceStatus.INTERMEDIATE_STATE_CAL_FAILED.name()); + + // test when replica is set to ANY_LIVEINSTANCE and all instances are taken offline. + idealState.setReplicas(ResourceConfig.ResourceConfigConstants.ANY_LIVEINSTANCE.name()); + + for (int i = 0; i < _partitions; i++) { + String partition = _dbName + "_" + i; + Map externalViewStateMap = externalView.getStateMap(partition); + for (String key : externalViewStateMap.keySet()) { + if (externalViewStateMap.get(key).equalsIgnoreCase("MASTER")) { + externalViewStateMap.put(key, "OFFLINE"); + } + } + externalView.setStateMap(partition, externalViewStateMap); + } + + monitor.updateResourceState(externalView, idealState, stateModelDef); + + Assert.assertEquals(monitor.getMissingTopStatePartitionGauge(), _partitions); + Assert.assertEquals(monitor.getMissingReplicaPartitionGauge(), 0); + Assert.assertEquals(monitor.getMissingMinActiveReplicaPartitionGauge(), 0); } finally { // Has to unregister this monitor to clean up. Otherwise, later tests may be affected and fail. monitor.unregister(); diff --git a/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java b/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java index d8810b153d..24113c9bed 100644 --- a/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java +++ b/helix-core/src/test/java/org/apache/helix/task/TestTargetedTaskStateChange.java @@ -86,7 +86,7 @@ public void testTwoRunningCurrentStates() { when(mock._cache.getTaskDataCache()).thenReturn(mock._taskDataCache); when(mock._cache.getJobContext(JOB_NAME)).thenReturn(mock._jobContext); when(mock._cache.getIdealStates()).thenReturn(mock._idealStates); - when(mock._cache.getAssignableEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); + when(mock._cache.getEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); when(mock._cache.getAssignableInstanceConfigMap()).thenReturn(_instanceConfigs); when(mock._cache.getClusterConfig()).thenReturn(_clusterConfig); when(mock._taskDataCache.getRuntimeJobDag(WORKFLOW_NAME)).thenReturn(mock._runtimeJobDag); @@ -123,7 +123,7 @@ public void testOneRunningOneNull() { when(mock._cache.getTaskDataCache()).thenReturn(mock._taskDataCache); when(mock._cache.getJobContext(JOB_NAME)).thenReturn(mock._jobContext); when(mock._cache.getIdealStates()).thenReturn(mock._idealStates); - when(mock._cache.getAssignableEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); + when(mock._cache.getEnabledLiveInstances()).thenReturn(_liveInstances.keySet()); when(mock._cache.getAssignableInstanceConfigMap()).thenReturn(_instanceConfigs); when(mock._cache.getClusterConfig()).thenReturn(_clusterConfig); when(mock._taskDataCache.getRuntimeJobDag(WORKFLOW_NAME)).thenReturn(mock._runtimeJobDag); diff --git a/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java b/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java index 6a7c8ba532..7d2a1b36fb 100644 --- a/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java +++ b/helix-core/src/test/java/org/apache/helix/util/TestIdealStateAssignment.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; import 
org.apache.helix.model.IdealState; import org.apache.helix.model.InstanceConfig; @@ -43,7 +44,8 @@ public void testIdealStateAssignment(String clusterName, List instances, for (String instance : instances) { instanceConfigs.add(new InstanceConfig(instance)); if (disabledInstances.contains(instance)) { - instanceConfigs.get(instanceConfigs.size() - 1).setInstanceEnabled(false); + instanceConfigs.get(instanceConfigs.size() - 1) + .setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE); } } diff --git a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java index 79b0fdce81..88dd053514 100644 --- a/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java +++ b/helix-core/src/test/java/org/apache/helix/util/TestInstanceValidationUtil.java @@ -33,6 +33,7 @@ import org.apache.helix.HelixException; import org.apache.helix.PropertyKey; import org.apache.helix.PropertyType; +import org.apache.helix.constants.InstanceConstants; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.CurrentState; import org.apache.helix.model.ExternalView; @@ -77,7 +78,9 @@ public void TestIsInstanceEnabled(boolean instanceConfigEnabled, boolean cluster boolean expected) { Mock mock = new Mock(); InstanceConfig instanceConfig = new InstanceConfig(TEST_INSTANCE); - instanceConfig.setInstanceEnabled(instanceConfigEnabled); + instanceConfig.setInstanceOperation( + instanceConfigEnabled ? InstanceConstants.InstanceOperation.ENABLE + : InstanceConstants.InstanceOperation.DISABLE); doReturn(instanceConfig).when(mock.dataAccessor) .getProperty(BUILDER.instanceConfig(TEST_INSTANCE)); ClusterConfig clusterConfig = new ClusterConfig(TEST_CLUSTER); @@ -101,17 +104,6 @@ public void TestIsInstanceEnabled_whenInstanceConfigNull() { InstanceValidationUtil.isEnabled(mock.dataAccessor, TEST_INSTANCE); } - @Test(expectedExceptions = HelixException.class) - public void TestIsInstanceEnabled_whenClusterConfigNull() { - Mock mock = new Mock(); - doReturn(new InstanceConfig(TEST_INSTANCE)).when(mock.dataAccessor) - .getProperty(argThat(new PropertyKeyArgument(PropertyType.CONFIGS))); - doReturn(null).when(mock.dataAccessor) - .getProperty(BUILDER.clusterConfig()); - - InstanceValidationUtil.isEnabled(mock.dataAccessor, TEST_INSTANCE); - } - @Test public void TestIsInstanceAlive() { Mock mock = new Mock(); diff --git a/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json b/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json index 365994257d..30553e0fda 100644 --- a/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json +++ b/helix-core/src/test/resources/TestAbstractRebalancer.ComputeBestPossibleState.json @@ -94,7 +94,8 @@ "node_3" ], "expectedBestPossibleStateMap": { - "node_1": "OFFLINE" + "node_1": "OFFLINE", + "node_3": "ERROR" } }, { diff --git a/helix-front/yarn.lock b/helix-front/yarn.lock index 57efb41424..4f31486519 100644 --- a/helix-front/yarn.lock +++ b/helix-front/yarn.lock @@ -11567,6 +11567,11 @@ minipass@^3.0.0, minipass@^3.1.0, minipass@^3.1.1, minipass@^3.1.3, minipass@^3. 
dependencies: yallist "^4.0.0" +minipass@^5.0.0: + version "5.0.0" + resolved "https://registry.yarnpkg.com/minipass/-/minipass-5.0.0.tgz#3e9788ffb90b694a5d0ec94479a45b5d8738133d" + integrity sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ== + minizlib@^2.0.0, minizlib@^2.1.1, minizlib@^2.1.2: version "2.1.2" resolved "https://registry.yarnpkg.com/minizlib/-/minizlib-2.1.2.tgz#e90d3466ba209b932451508a11ce3d3632145931" @@ -14895,13 +14900,13 @@ tapable@^2.0.0, tapable@^2.1.1, tapable@^2.2.0: integrity sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ== tar@^6.0.2, tar@^6.1.0, tar@^6.1.11, tar@^6.1.2: - version "6.1.11" - resolved "https://registry.yarnpkg.com/tar/-/tar-6.1.11.tgz#6760a38f003afa1b2ffd0ffe9e9abbd0eab3d621" - integrity sha512-an/KZQzQUkZCkuoAA64hM92X0Urb6VpRhAFllDzz44U2mcD5scmT3zBc4VgVpkugF580+DQn8eAFSyoQt0tznA== + version "6.2.1" + resolved "https://registry.yarnpkg.com/tar/-/tar-6.2.1.tgz#717549c541bc3c2af15751bea94b1dd068d4b03a" + integrity sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A== dependencies: chownr "^2.0.0" fs-minipass "^2.0.0" - minipass "^3.0.0" + minipass "^5.0.0" minizlib "^2.1.1" mkdirp "^1.0.3" yallist "^4.0.0" diff --git a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java index 877aaa9c89..bb5a2bc5c4 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java @@ -268,8 +268,8 @@ private void collectEvacuatingInstances(Set toBeStoppedInstances) { PropertyKey.Builder propertyKeyBuilder = _dataAccessor.keyBuilder(); InstanceConfig instanceConfig = _dataAccessor.getProperty(propertyKeyBuilder.instanceConfig(instance)); - if (InstanceConstants.InstanceOperation.EVACUATE.name() - .equals(instanceConfig.getInstanceOperation())) { + if (InstanceConstants.InstanceOperation.EVACUATE.equals( + instanceConfig.getInstanceOperation().getOperation())) { toBeStoppedInstances.add(instance); } } diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java index ce3d27273e..fdad634afd 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/AbstractResource.java @@ -89,7 +89,8 @@ public enum Command { canCompleteSwap, completeSwapIfPossible, onDemandRebalance, - isEvacuateFinished + isEvacuateFinished, + setPartitionsToError } @Context diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java index efeeee7f7e..55fc4de36e 100644 --- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java +++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/PerInstanceAccessor.java @@ -45,12 +45,14 @@ import com.fasterxml.jackson.databind.node.JsonNodeFactory; import com.fasterxml.jackson.databind.node.ObjectNode; import com.google.common.collect.ImmutableMap; +import 
org.apache.helix.BaseDataAccessor; import org.apache.helix.ConfigAccessor; import org.apache.helix.HelixAdmin; import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixException; import org.apache.helix.constants.InstanceConstants; import org.apache.helix.manager.zk.ZKHelixDataAccessor; +import org.apache.helix.manager.zk.ZkBaseDataAccessor; import org.apache.helix.model.CurrentState; import org.apache.helix.model.Error; import org.apache.helix.model.HealthStat; @@ -66,6 +68,7 @@ import org.apache.helix.rest.server.filters.ClusterAuth; import org.apache.helix.rest.server.json.instance.InstanceInfo; import org.apache.helix.rest.server.json.instance.StoppableCheck; +import org.apache.helix.util.InstanceUtil; import org.apache.helix.zookeeper.datamodel.ZNRecord; import org.eclipse.jetty.util.StringUtil; import org.slf4j.Logger; @@ -388,9 +391,11 @@ record = toZNRecord(content); @POST public Response updateInstance(@PathParam("clusterId") String clusterId, @PathParam("instanceName") String instanceName, @QueryParam("command") String command, - @QueryParam("instanceOperation") InstanceConstants.InstanceOperation state, - @QueryParam("instanceDisabledType") String disabledType, - @QueryParam("instanceDisabledReason") String disabledReason, + @QueryParam("instanceOperation") InstanceConstants.InstanceOperation instanceOperation, + @QueryParam("instanceOperationSource") InstanceConstants.InstanceOperationSource instanceOperationSource, + @QueryParam("reason") String reason, + @Deprecated @QueryParam("instanceDisabledType") String disabledType, + @Deprecated @QueryParam("instanceDisabledReason") String disabledReason, @QueryParam("force") boolean force, String content) { Command cmd; try { @@ -434,8 +439,23 @@ public Response updateInstance(@PathParam("clusterId") String clusterId, OBJECT_MAPPER.getTypeFactory() .constructCollectionType(List.class, String.class))); break; + case setPartitionsToError: + if (!validInstance(node, instanceName)) { + return badRequest("Instance names are not a match!"); + } + admin.setPartitionsToError(clusterId, instanceName, + node.get(PerInstanceProperties.resource.name()).textValue(), + (List) OBJECT_MAPPER.readValue( + node.get(PerInstanceProperties.partitions.name()).toString(), OBJECT_MAPPER + .getTypeFactory().constructCollectionType(List.class, String.class))); + break; case setInstanceOperation: - admin.setInstanceOperation(clusterId, instanceName, state); + InstanceUtil.setInstanceOperation(new ConfigAccessor(getRealmAwareZkClient()), + new ZkBaseDataAccessor<>(getRealmAwareZkClient()), clusterId, instanceName, + new InstanceConfig.InstanceOperation.Builder().setOperation(instanceOperation) + .setReason(reason).setSource( + force ? 
+                  force ? InstanceConstants.InstanceOperationSource.ADMIN : instanceOperationSource)
+              .build());
       break;
     case canCompleteSwap:
       return OK(OBJECT_MAPPER.writeValueAsString(
diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java
index 03465a9cd8..714b53f450 100644
--- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java
+++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/ResourceAssignmentOptimizerAccessor.java
@@ -44,6 +44,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import org.apache.helix.ConfigAccessor;
 import org.apache.helix.HelixDataAccessor;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.controller.rebalancer.strategy.AutoRebalanceStrategy;
 import org.apache.helix.controller.rebalancer.strategy.RebalanceStrategy;
 import org.apache.helix.controller.rebalancer.waged.WagedRebalancer;
@@ -225,7 +226,8 @@ private ClusterState readClusterStateAndValidateInput(String clusterId, InputFie
     // Throw exception if there is no instanceConfig for activatedInstances instance.
     for (String instance : inputFields.activatedInstances) {
       if (instanceConfigMap.containsKey(instance)) {
-        instanceConfigMap.get(instance).setInstanceEnabled(true);
+        instanceConfigMap.get(instance)
+            .setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
       } else {
         throw new InvalidParameterException(
             "instance: " + instance + "does not have instanceConfig");
@@ -234,7 +236,8 @@ private ClusterState readClusterStateAndValidateInput(String clusterId, InputFie
     for (String instance : inputFields.deactivatedInstances) {
       if (instanceConfigMap.containsKey(instance)) {
-        instanceConfigMap.get(instance).setInstanceEnabled(false);
+        instanceConfigMap.get(instance)
+            .setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
       }
     }
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java b/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java
index d0f0c57151..c6ff0d6b02 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/AbstractTestClass.java
@@ -45,6 +45,7 @@ import org.apache.helix.PropertyPathBuilder;
 import org.apache.helix.PropertyType;
 import org.apache.helix.TestHelper;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.integration.manager.ClusterControllerManager;
 import org.apache.helix.integration.manager.MockParticipantManager;
 import org.apache.helix.integration.task.MockTask;
@@ -574,7 +575,7 @@ private void preSetupForParallelInstancesStoppableTest(String clusterName,
     instanceConfigs.add(new InstanceConfig(instances.get(instances.size() - 1)));
     instanceConfigs.get(instanceConfigs.size() - 1).setDomain("helixZoneId=zone2,host=instance5");
-    instanceConfigs.get(1).setInstanceEnabled(false);
+    instanceConfigs.get(1).setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
     instanceConfigs.get(3).setInstanceEnabledForPartition("FakeResource", "FakePartition", false);
     for (InstanceConfig instanceConfig : instanceConfigs) {
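For reference, the REST handler above translates command=setInstanceOperation into a typed InstanceConfig.InstanceOperation and delegates to InstanceUtil. A minimal sketch of the equivalent programmatic call, assuming configAccessor, baseDataAccessor, clusterName, and instanceName are already available; the reason string is illustrative:

    InstanceConfig.InstanceOperation operation =
        new InstanceConfig.InstanceOperation.Builder()
            .setOperation(InstanceConstants.InstanceOperation.EVACUATE)
            .setReason("planned host maintenance")
            // force=true in the REST call maps the source to ADMIN; USER is shown here.
            .setSource(InstanceConstants.InstanceOperationSource.USER)
            .build();
    InstanceUtil.setInstanceOperation(configAccessor, baseDataAccessor, clusterName,
        instanceName, operation);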
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
index 93722f05af..0403083fb7 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java
@@ -359,7 +359,7 @@ public void testInstancesStoppable_disableOneInstance() throws IOException {
     // Disable one selected instance0, it should failed to check
     String instance = "instance0";
     InstanceConfig instanceConfig = _configAccessor.getInstanceConfig(STOPPABLE_CLUSTER, instance);
-    instanceConfig.setInstanceEnabled(false);
+    instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
     instanceConfig.setInstanceEnabledForPartition("FakeResource", "FakePartition", false);
     _configAccessor.setInstanceConfig(STOPPABLE_CLUSTER, instance, instanceConfig);
@@ -377,7 +377,7 @@ public void testInstancesStoppable_disableOneInstance() throws IOException {
         ImmutableSet.of("HELIX:HAS_DISABLED_PARTITION","HELIX:INSTANCE_NOT_ENABLED","HELIX:INSTANCE_NOT_STABLE","HELIX:MIN_ACTIVE_REPLICA_CHECK_FAILED"));
     // Reenable instance0, it should passed the check
-    instanceConfig.setInstanceEnabled(true);
+    instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
     instanceConfig.setInstanceEnabledForPartition("FakeResource", "FakePartition", true);
     _configAccessor.setInstanceConfig(STOPPABLE_CLUSTER, instance, instanceConfig);
     Assert.assertTrue(verifier.verifyByPolling());
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java
index 2c7a46b094..e00c392b0f 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPartitionAssignmentAPI.java
@@ -36,6 +36,7 @@ import org.apache.helix.ConfigAccessor;
 import org.apache.helix.HelixDataAccessor;
 import org.apache.helix.TestHelper;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.controller.rebalancer.DelayedAutoRebalancer;
 import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy;
 import org.apache.helix.controller.rebalancer.waged.WagedRebalancer;
@@ -107,7 +108,7 @@ public void beforeTest() {
     for (int i = 0; i < DEFAULT_INSTANCE_COUNT; i++) {
       String instanceName = INSTANCE_NAME_PREFIX + (INSTANCE_START_PORT + i);
       InstanceConfig instanceConfig = new InstanceConfig(instanceName);
-      instanceConfig.setInstanceEnabled(true);
+      instanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
       instanceConfig.setInstanceCapacityMap(
           Collections.singletonMap(INSTANCE_CAPACITY_KEY, DEFAULT_INSTANCE_CAPACITY));
       _gSetupTool.getClusterManagementTool().addInstance(CLUSTER_NAME, instanceConfig);
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java
index f9e48ca1f5..6ab727e85e 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestPerInstanceAccessor.java
@@ -36,13 +36,14 @@ import com.fasterxml.jackson.databind.node.ArrayNode;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
-import org.apache.helix.ConfigAccessor;
 import org.apache.helix.HelixDataAccessor;
+import org.apache.helix.HelixDefinedState;
 import org.apache.helix.HelixException;
 import org.apache.helix.TestHelper;
 import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.model.ClusterConfig;
+import org.apache.helix.model.ExternalView;
 import org.apache.helix.model.IdealState;
 import org.apache.helix.model.InstanceConfig;
 import org.apache.helix.model.Message;
@@ -50,9 +51,6 @@ import org.apache.helix.rest.server.resources.helix.InstancesAccessor;
 import org.apache.helix.rest.server.resources.helix.PerInstanceAccessor;
 import org.apache.helix.rest.server.util.JerseyUriRequestBuilder;
-import org.apache.helix.tools.ClusterStateVerifier;
-import org.apache.helix.tools.ClusterVerifiers.StrictMatchExternalViewVerifier;
-import org.apache.helix.tools.ClusterVerifiers.ZkHelixClusterVerifier;
 import org.apache.helix.zookeeper.datamodel.ZNRecord;
 import org.testng.Assert;
 import org.testng.annotations.Test;
@@ -381,7 +379,7 @@ public void testDeleteInstance() {
   }
   @Test(dependsOnMethods = "testDeleteInstance")
-  public void updateInstance() throws IOException {
+  public void updateInstance() throws Exception {
     System.out.println("Start test :" + TestHelper.getTestMethodName());
     // Disable instance
     Entity entity = Entity.entity("", MediaType.APPLICATION_JSON_TYPE);
@@ -465,11 +463,11 @@ public void updateInstance() throws IOException {
     String dbName = "_db_0_";
     List partitionsToDisable = Arrays.asList(CLUSTER_NAME + dbName + "0",
         CLUSTER_NAME + dbName + "1", CLUSTER_NAME + dbName + "3");
+    String RESOURCE_NAME = CLUSTER_NAME + dbName.substring(0, dbName.length() - 1);
     entity = Entity.entity(
         OBJECT_MAPPER.writeValueAsString(ImmutableMap.of(AbstractResource.Properties.id.name(),
-            INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(),
-            CLUSTER_NAME + dbName.substring(0, dbName.length() - 1),
+            INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME,
             PerInstanceAccessor.PerInstanceProperties.partitions.name(), partitionsToDisable)),
         MediaType.APPLICATION_JSON_TYPE);
@@ -478,13 +476,11 @@ public void updateInstance() throws IOException {
     InstanceConfig instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME);
     Assert.assertEquals(
-        new HashSet<>(instanceConfig.getDisabledPartitionsMap()
-            .get(CLUSTER_NAME + dbName.substring(0, dbName.length() - 1))),
+        new HashSet<>(instanceConfig.getDisabledPartitionsMap().get(RESOURCE_NAME)),
         new HashSet<>(partitionsToDisable));
     entity = Entity.entity(OBJECT_MAPPER.writeValueAsString(ImmutableMap
         .of(AbstractResource.Properties.id.name(), INSTANCE_NAME,
-            PerInstanceAccessor.PerInstanceProperties.resource.name(),
-            CLUSTER_NAME + dbName.substring(0, dbName.length() - 1),
+            PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME,
             PerInstanceAccessor.PerInstanceProperties.partitions.name(),
             ImmutableList.of(CLUSTER_NAME + dbName + "1"))),
         MediaType.APPLICATION_JSON_TYPE);
@@ -492,23 +488,22 @@ public void updateInstance() throws IOException {
         .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity);
     instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME);
-    Assert.assertEquals(new HashSet<>(instanceConfig.getDisabledPartitionsMap()
-        .get(CLUSTER_NAME + dbName.substring(0, dbName.length() - 1))),
+    Assert.assertEquals(new HashSet<>(instanceConfig.getDisabledPartitionsMap().get(RESOURCE_NAME)),
         new HashSet<>(Arrays.asList(CLUSTER_NAME + dbName + "0", CLUSTER_NAME + dbName + "3")));
     // test set instance operation
JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString()); + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.EVACUATE); new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=INVALIDOP") .expectedReturnStatusCode(Response.Status.NOT_FOUND.getStatusCode()).format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), ""); + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.ENABLE); // test canCompleteSwap Response canCompleteSwapResponse = @@ -548,8 +543,8 @@ public void updateInstance() throws IOException { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, INSTANCE_NAME); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString()); + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.EVACUATE); Response response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished") .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); @@ -591,14 +586,40 @@ public void updateInstance() throws IOException { new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setInstanceOperation&instanceOperation=EVACUATE") .format(CLUSTER_NAME, test_instance_name).post(this, entity); instanceConfig = _configAccessor.getInstanceConfig(CLUSTER_NAME, test_instance_name); - Assert.assertEquals( - instanceConfig.getInstanceOperation(), InstanceConstants.InstanceOperation.EVACUATE.toString()); + Assert.assertEquals(instanceConfig.getInstanceOperation().getOperation(), + InstanceConstants.InstanceOperation.EVACUATE); response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=isEvacuateFinished") .format(CLUSTER_NAME, test_instance_name).post(this, entity); evacuateFinishedResult = OBJECT_MAPPER.readValue(response.readEntity(String.class), Map.class); Assert.assertEquals(response.getStatus(), Response.Status.OK.getStatusCode()); Assert.assertTrue(evacuateFinishedResult.get("successful")); + + // test setPartitionsToError + List partitionsToSetToError = Arrays.asList(CLUSTER_NAME + dbName + "7"); + + entity = Entity.entity( + OBJECT_MAPPER.writeValueAsString(ImmutableMap.of(AbstractResource.Properties.id.name(), + INSTANCE_NAME, PerInstanceAccessor.PerInstanceProperties.resource.name(), RESOURCE_NAME, + PerInstanceAccessor.PerInstanceProperties.partitions.name(), partitionsToSetToError)), + MediaType.APPLICATION_JSON_TYPE); + + response = new JerseyUriRequestBuilder("clusters/{}/instances/{}?command=setPartitionsToError") + .format(CLUSTER_NAME, INSTANCE_NAME).post(this, entity); + + 
+    Assert.assertEquals(response.getStatus(), Response.Status.OK.getStatusCode());
+
+    Assert.assertTrue(TestHelper.verify(() -> {
+      ExternalView externalView = _gSetupTool.getClusterManagementTool()
+          .getResourceExternalView(CLUSTER_NAME, RESOURCE_NAME);
+      Set<Boolean> responseForAllPartitions = new HashSet<>();
+      for (String partition : partitionsToSetToError) {
+        responseForAllPartitions.add(HelixDefinedState.ERROR.toString()
+            .equals(externalView.getStateMap(partition).get(INSTANCE_NAME)));
+      }
+      return !responseForAllPartitions.contains(Boolean.FALSE);
+    }, TestHelper.WAIT_DURATION));
+
     System.out.println("End test :" + TestHelper.getTestMethodName());
   }
diff --git a/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java b/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
index 899256619f..7d49318f02 100644
--- a/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
+++ b/helix-rest/src/test/java/org/apache/helix/rest/server/TestResourceAssignmentOptimizerAccessor.java
@@ -34,6 +34,7 @@ import com.fasterxml.jackson.core.type.TypeReference;
 import org.apache.helix.HelixDataAccessor;
 import org.apache.helix.TestHelper;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.model.IdealState;
 import org.apache.helix.model.InstanceConfig;
@@ -68,7 +69,7 @@ public void beforeClass() {
     toEnabledInstance = liveInstances.get(2);
     InstanceConfig config = _gSetupTool.getClusterManagementTool()
         .getInstanceConfig(cluster, toEnabledInstance);
-    config.setInstanceEnabled(false);
+    config.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
     _gSetupTool.getClusterManagementTool()
         .setInstanceConfig(cluster, toEnabledInstance, config);
@@ -94,7 +95,7 @@ public void afterClass() {
     }
     InstanceConfig config = _gSetupTool.getClusterManagementTool()
         .getInstanceConfig(cluster, toEnabledInstance);
-    config.setInstanceEnabled(true);
+    config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
     _gSetupTool.getClusterManagementTool().setInstanceConfig(cluster, toEnabledInstance, config);
     _gSetupTool.getClusterManagementTool()
         .enableMaintenanceMode(cluster, false, TestHelper.getTestMethodName());
@@ -245,8 +246,8 @@ public void testComputePartitionAssignmentWaged() throws IOException {
     InstanceConfig toEnabledInstanceConfig =
         _gSetupTool.getClusterManagementTool().getInstanceConfig(cluster, toEnabledInstance);
     // Another way to mark the node as inactive or active.
-    toDeactivatedInstanceConfig.setInstanceEnabled(false);
-    toEnabledInstanceConfig.setInstanceEnabled(true);
+    toDeactivatedInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.DISABLE);
+    toEnabledInstanceConfig.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
     // Write the current InstanceConfigs record to json string
     StringWriter sw = new StringWriter();
     OBJECT_MAPPER.writeValue(sw, toDeactivatedInstanceConfig.getRecord());
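A minimal sketch of a request exercising the new setPartitionsToError command, mirroring the test added above; the instance, resource, and partition names are placeholders:

    // POST clusters/{clusterId}/instances/{instanceName}?command=setPartitionsToError
    Entity entity = Entity.entity(
        OBJECT_MAPPER.writeValueAsString(ImmutableMap.of(
            AbstractResource.Properties.id.name(), "instance_12918",
            PerInstanceAccessor.PerInstanceProperties.resource.name(), "TestDB_0",
            PerInstanceAccessor.PerInstanceProperties.partitions.name(),
            Arrays.asList("TestDB_0_7"))),
        MediaType.APPLICATION_JSON_TYPE);
    // The handler reads the resource and partition list from this body and calls
    // HelixAdmin.setPartitionsToError(clusterId, instanceName, resource, partitions).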
diff --git a/meta-client/pom.xml b/meta-client/pom.xml
index 29092ef1e8..a4762eb371 100644
--- a/meta-client/pom.xml
+++ b/meta-client/pom.xml
@@ -89,6 +89,54 @@ under the License.
+
+        org.apache.maven.plugins
+        maven-compiler-plugin
+        3.12.1
+
+
+            JDK 8
+            compile
+
+              compile
+
+
+            ${project.build.outputDirectory}_jdk8
+            8
+            true
+
+
+
+            JDK 11
+            compile
+
+              compile
+
+
+            11
+            true
+
+
+
+
+        org.apache.maven.plugins
+        maven-jar-plugin
+        3.3.0
+
+
+            default-package-jdk11
+            package
+
+              jar
+
+
+            ${project.build.outputDirectory}_jdk8
+            jdk8
+
+
+
       org.apache.maven.plugins
       maven-assembly-plugin
diff --git a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java
index 3bcf09ceb3..ae7d9c9fab 100644
--- a/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java
+++ b/meta-client/src/main/java/org/apache/helix/metaclient/recipes/leaderelection/LeaderElectionClient.java
@@ -97,7 +97,13 @@ public LeaderElectionClient(MetaClientConfig metaClientConfig, String participan
     LOG.info("Creating MetaClient for LeaderElectionClient");
     if (MetaClientConfig.StoreType.ZOOKEEPER.equals(metaClientConfig.getStoreType())) {
       ZkMetaClientConfig zkMetaClientConfig = new ZkMetaClientConfig.ZkMetaClientConfigBuilder().setConnectionAddress(
-          metaClientConfig.getConnectionAddress()).setZkSerializer((new LeaderInfoSerializer())).build();
+          metaClientConfig.getConnectionAddress())
+          .setZkSerializer((new LeaderInfoSerializer()))
+          .setSessionTimeoutInMillis(metaClientConfig.getSessionTimeoutInMillis())
+          .setMetaClientReconnectPolicy(metaClientConfig.getMetaClientReconnectPolicy())
+          .setConnectionInitTimeoutInMillis(metaClientConfig.getConnectionInitTimeoutInMillis())
+          .setAuthEnabled(metaClientConfig.isAuthEnabled())
+          .build();
       _metaClient = new ZkMetaClientFactory().getMetaClient(zkMetaClientConfig);
       _metaClient.connect();
       _metaClient.subscribeStateChanges(_connectStateListener);
@@ -121,7 +127,8 @@ public LeaderElectionClient(MetaClientInterface metaClient, String p
    * Returns true if current participant is the current leadership.
    */
   public boolean isLeader(String leaderPath) {
-    return getLeader(leaderPath).equalsIgnoreCase(_participant);
+    String leader = getLeader(leaderPath);
+    return leader != null && leader.equalsIgnoreCase(_participant);
   }
 
   /**
diff --git a/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java b/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
index 75917623c7..248643652a 100644
--- a/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
+++ b/meta-client/src/test/java/org/apache/helix/metaclient/recipes/leaderelection/TestLeaderElection.java
@@ -41,7 +41,21 @@ public void cleanUp() {
     }
   }
 
+  // Test that calling isLeader before client joins LeaderElectionParticipantPool returns false and does not throw NPE
   @Test
+  public void testIsLeaderBeforeJoiningParticipantPool() throws Exception {
+    String leaderPath = LEADER_PATH + "/testIsLeaderBeforeJoiningPool";
+    LeaderElectionClient clt1 = createLeaderElectionClient(PARTICIPANT_NAME1);
+    try {
+      boolean isLeader = clt1.isLeader(leaderPath);
+      Assert.assertFalse(isLeader, "Expected isLeader to return false before joining participant pool");
+    } catch (NullPointerException npe) {
+      Assert.fail("isLeader threw NPE before joining participant pool: " + npe.getMessage());
+    }
+    clt1.close();
+  }
+
+  @Test (dependsOnMethods = "testIsLeaderBeforeJoiningParticipantPool")
   public void testAcquireLeadership() throws Exception {
     System.out.println("START TestLeaderElection.testAcquireLeadership");
     String leaderPath = LEADER_PATH + "/testAcquireLeadership";
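A minimal usage sketch of the null-safe isLeader() behavior exercised by the new test; the construction helper and leader path are placeholders taken from the test above:

    LeaderElectionClient client = createLeaderElectionClient("participant_1");
    // Before the client joins the participant pool there is no leader entry, so
    // getLeader(leaderPath) returns null and isLeader() now returns false instead of
    // throwing a NullPointerException.
    Assert.assertFalse(client.isLeader("/LEADER_ELECTION_GROUP_1/beforeJoin"));
    client.close();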
diff --git a/metadata-store-directory-common/pom.xml b/metadata-store-directory-common/pom.xml
index f173397cc4..98e2e5cb03 100644
--- a/metadata-store-directory-common/pom.xml
+++ b/metadata-store-directory-common/pom.xml
@@ -113,6 +113,54 @@
+
+        org.apache.maven.plugins
+        maven-compiler-plugin
+        3.12.1
+
+
+            JDK 8
+            compile
+
+              compile
+
+
+            ${project.build.outputDirectory}_jdk8
+            8
+            true
+
+
+
+            JDK 11
+            compile
+
+              compile
+
+
+            11
+            true
+
+
+
+
+        org.apache.maven.plugins
+        maven-jar-plugin
+        3.3.0
+
+
+            default-package-jdk11
+            package
+
+              jar
+
+
+            ${project.build.outputDirectory}_jdk8
+            jdk8
+
+
+
       org.apache.maven.plugins
       maven-assembly-plugin
diff --git a/metrics-common/pom.xml b/metrics-common/pom.xml
index 2dbe016cb2..433c575aac 100644
--- a/metrics-common/pom.xml
+++ b/metrics-common/pom.xml
@@ -84,6 +84,54 @@
+
+        org.apache.maven.plugins
+        maven-compiler-plugin
+        3.12.1
+
+
+            JDK 8
+            compile
+
+              compile
+
+
+            ${project.build.outputDirectory}_jdk8
+            8
+            true
+
+
+
+            JDK 11
+            compile
+
+              compile
+
+
+            11
+            true
+
+
+
+
+        org.apache.maven.plugins
+        maven-jar-plugin
+        3.3.0
+
+
+            default-package-jdk11
+            package
+
+              jar
+
+
+            ${project.build.outputDirectory}_jdk8
+            jdk8
+
+
+
       org.apache.maven.plugins
       maven-assembly-plugin
diff --git a/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java b/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java
index 998f1175a0..eddd68b387 100644
--- a/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java
+++ b/recipes/rabbitmq-consumer-group/src/main/java/org/apache/helix/recipes/rabbitmq/Consumer.java
@@ -24,6 +24,7 @@ import org.apache.helix.HelixManager;
 import org.apache.helix.HelixManagerFactory;
 import org.apache.helix.InstanceType;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixAdmin;
 import org.apache.helix.manager.zk.ZNRecordSerializer;
 import org.apache.helix.zookeeper.impl.client.ZkClient;
@@ -98,7 +99,7 @@ public static void main(String[] args) throws Exception {
     if (!nodes.contains("consumer_" + consumerId)) {
       InstanceConfig config = new InstanceConfig("consumer_" + consumerId);
       config.setHostName("localhost");
-      config.setInstanceEnabled(true);
+      config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
       admin.addInstance(clusterName, config);
     }
diff --git a/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java b/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java
index 9dbcbfe048..5b8e736a97 100644
--- a/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java
+++ b/recipes/rsync-replicated-file-system/src/main/java/org/apache/helix/filestore/SetupCluster.java
@@ -19,6 +19,7 @@
  * under the License.
  */
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixAdmin;
 import org.apache.helix.manager.zk.ZNRecordSerializer;
 import org.apache.helix.zookeeper.impl.client.ZkClient;
@@ -65,7 +66,7 @@ public static void main(String[] args) {
       InstanceConfig config = new InstanceConfig(serverId);
       config.setHostName("localhost");
       config.setPort(port);
-      config.setInstanceEnabled(true);
+      config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
       admin.addInstance(clusterName, config);
     }
     // add resource "repository" which has 1 partition
diff --git a/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java b/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java
index 11d43953b4..3704c5406d 100644
--- a/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java
+++ b/recipes/task-execution/src/main/java/org/apache/helix/taskexecution/Worker.java
@@ -24,6 +24,7 @@ import org.apache.helix.HelixManager;
 import org.apache.helix.HelixManagerFactory;
 import org.apache.helix.InstanceType;
+import org.apache.helix.constants.InstanceConstants;
 import org.apache.helix.manager.zk.ZKHelixAdmin;
 import org.apache.helix.manager.zk.ZNRecordSerializer;
 import org.apache.helix.zookeeper.impl.client.ZkClient;
@@ -98,7 +99,7 @@ public void run() {
     if (!nodes.contains(_instanceName)) {
       InstanceConfig config = new InstanceConfig(_instanceName);
       config.setHostName("localhost");
-      config.setInstanceEnabled(true);
+      config.setInstanceOperation(InstanceConstants.InstanceOperation.ENABLE);
       admin.addInstance(_clusterName, config);
     }
diff --git a/zookeeper-api/pom.xml b/zookeeper-api/pom.xml
index d44160fdb7..bfb993feec 100644
--- a/zookeeper-api/pom.xml
+++ b/zookeeper-api/pom.xml
@@ -133,6 +133,54 @@
+
+        org.apache.maven.plugins
+        maven-compiler-plugin
+        3.12.1
+
+
+            JDK 8
+            compile
+
+              compile
+
+
+            ${project.build.outputDirectory}_jdk8
+            8
+            true
+
+
+
+            JDK 11
+            compile
+
+              compile
+
+
+            11
+            true
+
+
+
+
+        org.apache.maven.plugins
+        maven-jar-plugin
+        3.3.0
+
+
+            default-package-jdk11
+            package
+
+              jar
+
+
+            ${project.build.outputDirectory}_jdk8
+            jdk8
+
+
+
       org.apache.maven.plugins
       maven-assembly-plugin