From e90f4a5ec773236ac9177aad1d5f0dd7986968a0 Mon Sep 17 00:00:00 2001 From: xyuanlu Date: Wed, 20 Sep 2023 10:58:08 -0700 Subject: [PATCH 1/2] Exclude on-operation instance from computing min active replica in WAGED. (#2621) Exclude on-operation instance from computing min active replica in WAGED. --- .../rebalancer/waged/WagedRebalancer.java | 15 ++++++- .../rebalancer/TestInstanceOperation.java | 42 ++++++++++++++----- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java index 6c1c4d74d9..e717aa9962 100644 --- a/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java +++ b/helix-core/src/main/java/org/apache/helix/controller/rebalancer/waged/WagedRebalancer.java @@ -47,6 +47,7 @@ import org.apache.helix.controller.stages.CurrentStateOutput; import org.apache.helix.model.ClusterConfig; import org.apache.helix.model.IdealState; +import org.apache.helix.model.InstanceConfig; import org.apache.helix.model.Partition; import org.apache.helix.model.Resource; import org.apache.helix.model.ResourceAssignment; @@ -395,7 +396,8 @@ private Map handleDelayedRebalanceMinActiveReplica( Map currentResourceAssignment, RebalanceAlgorithm algorithm) throws HelixRebalanceException { // the "real" live nodes at the time - final Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); + // TODO: this is a hacky way to filter our on operation instance. We should consider redesign `getEnabledLiveInstances()`. + final Set enabledLiveInstances = filterOutOnOperationInstances(clusterData.getInstanceConfigMap(), clusterData.getEnabledLiveInstances()); if (activeNodes.equals(enabledLiveInstances) || !requireRebalanceOverwrite(clusterData, currentResourceAssignment)) { // no need for additional process, return the current resource assignment return currentResourceAssignment; @@ -424,6 +426,14 @@ private Map handleDelayedRebalanceMinActiveReplica( } } + private static Set filterOutOnOperationInstances(Map instanceConfigMap, + Set nodes) { + return nodes.stream() + .filter( + instance -> !DelayedAutoRebalancer.INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT.contains(instanceConfigMap.get(instance).getInstanceOperation())) + .collect(Collectors.toSet()); + } + /** * Emergency rebalance is scheduled to quickly handle urgent cases like reassigning partitions from inactive nodes * and addressing for partitions failing to meet minActiveReplicas. @@ -608,7 +618,8 @@ protected boolean requireRebalanceOverwrite(ResourceControllerDataProvider clust bestPossibleAssignment.values().parallelStream().forEach((resourceAssignment -> { String resourceName = resourceAssignment.getResourceName(); IdealState currentIdealState = clusterData.getIdealState(resourceName); - Set enabledLiveInstances = clusterData.getEnabledLiveInstances(); + Set enabledLiveInstances = + filterOutOnOperationInstances(clusterData.getInstanceConfigMap(), clusterData.getEnabledLiveInstances()); int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size()); int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig .mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName), diff --git a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java index 6c51d58bbc..c9c9c75979 100644 --- a/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java +++ b/helix-core/src/test/java/org/apache/helix/integration/rebalancer/TestInstanceOperation.java @@ -15,6 +15,7 @@ import org.apache.helix.HelixDataAccessor; import org.apache.helix.HelixRollbackException; import org.apache.helix.NotificationContext; +import org.apache.helix.PropertyPathBuilder; import org.apache.helix.TestHelper; import org.apache.helix.common.ZkTestBase; import org.apache.helix.constants.InstanceConstants; @@ -89,9 +90,11 @@ public void beforeClass() throws Exception { ClusterConfig clusterConfig = _configAccessor.getClusterConfig(CLUSTER_NAME); clusterConfig.stateTransitionCancelEnabled(true); + clusterConfig.setDelayRebalaceEnabled(true); + clusterConfig.setRebalanceDelayTime(1800000L); _configAccessor.setClusterConfig(CLUSTER_NAME, clusterConfig); - createTestDBs(200); + createTestDBs(1800000L); setUpWagedBaseline(); @@ -199,7 +202,7 @@ public void testAddingNodeWithEvacuationTag() throws Exception { public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception { // add a resource where downward state transition is slow createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", PARTITIONS, REPLICA, - REPLICA - 1, 200, CrushEdRebalanceStrategy.class.getName()); + REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName()); _allDBs.add("TEST_DB3_DELAYED_CRUSHED"); // add a resource where downward state transition is slow createResourceWithWagedRebalance(CLUSTER_NAME, "TEST_DB4_DELAYED_WAGED", "MasterSlave", @@ -338,21 +341,38 @@ public void testMarkEvacuationAfterEMM() throws Exception { @Test(dependsOnMethods = "testMarkEvacuationAfterEMM") public void testEvacuationWithOfflineInstancesInCluster() throws Exception { + _participants.get(1).syncStop(); _participants.get(2).syncStop(); - _participants.get(3).syncStop(); - // wait for converge, and set evacuate on instance 0 - Assert.assertTrue(_clusterVerifier.verifyByPolling()); - String evacuateInstanceName = _participants.get(0).getInstanceName(); + String evacuateInstanceName = _participants.get(_participants.size()-2).getInstanceName(); _gSetupTool.getClusterManagementTool() .setInstanceOperation(CLUSTER_NAME, evacuateInstanceName, InstanceConstants.InstanceOperation.EVACUATE); - Map assignment; - List currentActiveInstances = - _participantNames.stream().filter(n -> (!n.equals(evacuateInstanceName) && !n.equals(_participants.get(3).getInstanceName()))).collect(Collectors.toList()); - TestHelper.verify( ()-> {return verifyIS(evacuateInstanceName);}, TestHelper.WAIT_DURATION); + Map assignment; + // EV should contain all participants, check resources one by one + assignment = getEV(); + for (String resource : _allDBs) { + ExternalView ev = assignment.get(resource); + for (String partition : ev.getPartitionSet()) { + AtomicInteger activeReplicaCount = new AtomicInteger(); + ev.getStateMap(partition) + .values() + .stream() + .filter( + v -> v.equals("MASTER") || v.equals("LEADER") || v.equals("SLAVE") || v.equals("FOLLOWER") || v.equals( + "STANDBY")) + .forEach(v -> activeReplicaCount.getAndIncrement()); + Assert.assertTrue(activeReplicaCount.get() >= REPLICA-1); + Assert.assertFalse(ev.getStateMap(partition).containsKey(evacuateInstanceName) && ev.getStateMap(partition) + .get(evacuateInstanceName) + .equals("MASTER") && ev.getStateMap(partition) + .get(evacuateInstanceName) + .equals("LEADER")); + + } + } - _participants.get(3).syncStart(); + _participants.get(1).syncStart(); _participants.get(2).syncStart(); } From 83920d7bfc3a87029bfb425435ba81c95f57b263 Mon Sep 17 00:00:00 2001 From: xyuanlu Date: Thu, 12 Oct 2023 11:59:55 -0700 Subject: [PATCH 2/2] property --- .../HelixRestDataProviderManager.java | 68 ++++++++++++++ .../dataprovider/PerClusterDataProvider.java | 91 +++++++++++++++++++ .../dataprovider/RestCurrentStateCache.java | 42 +++++++++ .../dataprovider/RestPropertyCache.java | 82 +++++++++++++++++ .../dataprovider/RestServiceDataProvider.java | 43 +++++++++ 5 files changed, 326 insertions(+) create mode 100644 helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/HelixRestDataProviderManager.java create mode 100644 helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/PerClusterDataProvider.java create mode 100644 helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestCurrentStateCache.java create mode 100644 helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestPropertyCache.java create mode 100644 helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestServiceDataProvider.java diff --git a/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/HelixRestDataProviderManager.java b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/HelixRestDataProviderManager.java new file mode 100644 index 0000000000..58d59fc87d --- /dev/null +++ b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/HelixRestDataProviderManager.java @@ -0,0 +1,68 @@ +package org.apache.helix.rest.common.dataprovider; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ + +/** Init, register listener and manager callback handler for different + * clusters manage the providers lifecycle + * + */ + +import java.util.List; +import org.apache.helix.HelixAdmin; +import org.apache.helix.manager.zk.ZkBaseDataAccessor; +import org.apache.helix.zookeeper.api.client.RealmAwareZkClient; + + +public class HelixRestDataProviderManager { + + protected RealmAwareZkClient _zkclient; + + private HelixAdmin _helixAdmin; + + private RestServiceDataProvider _restServiceDataProvider; + // list of callback handlers + + //TODO: create own zk client + public HelixRestDataProviderManager(RealmAwareZkClient zkclient, HelixAdmin helixAdmin) { + _zkclient = zkclient; + _helixAdmin = helixAdmin; + _restServiceDataProvider = new RestServiceDataProvider(); + init(); + } + + public RestServiceDataProvider getRestServiceDataProvider() { + return _restServiceDataProvider; + } + + private void init() { + List clusters = _helixAdmin.getClusters(); + for (String cluster : clusters) { + PerClusterDataProvider clusterDataProvider = + new PerClusterDataProvider(cluster, _zkclient, new ZkBaseDataAccessor(_zkclient)); + clusterDataProvider.initCache(); + // register callback handler for each provider + _restServiceDataProvider.addClusterDataProvider(cluster, clusterDataProvider); + } + } + + public void close() { + } +} diff --git a/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/PerClusterDataProvider.java b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/PerClusterDataProvider.java new file mode 100644 index 0000000000..adcde81f28 --- /dev/null +++ b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/PerClusterDataProvider.java @@ -0,0 +1,91 @@ +package org.apache.helix.rest.common.dataprovider; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.util.Map; +import org.apache.helix.BaseDataAccessor; +import org.apache.helix.HelixDataAccessor; +import org.apache.helix.PropertyKey; +import org.apache.helix.common.caches.PropertyCache; +import org.apache.helix.manager.zk.ZKHelixDataAccessor; +import org.apache.helix.model.ClusterConfig; +import org.apache.helix.model.ClusterConstraints; +import org.apache.helix.model.CurrentState; +import org.apache.helix.model.ExternalView; +import org.apache.helix.model.IdealState; +import org.apache.helix.model.InstanceConfig; +import org.apache.helix.model.LiveInstance; +import org.apache.helix.model.ResourceConfig; +import org.apache.helix.model.StateModelDefinition; +import org.apache.helix.zookeeper.api.client.RealmAwareZkClient; + + +/** + * Dara cache for each Helix cluster. Configs, ideal stats and current states are read from ZK and updated + * using event changes. External view are consolidated using current state. + */ +public class PerClusterDataProvider { + + private HelixDataAccessor _accessor; + + private RealmAwareZkClient _zkclient; + + private final String _clusterName; + + // Simple caches + private final RestPropertyCache _instanceConfigCache; + private final RestPropertyCache _clusterConfigCache; + private final RestPropertyCache _resourceConfigCache; + private final RestPropertyCache _liveInstanceCache; + private final RestPropertyCache _idealStateCache; + private final RestPropertyCache _stateModelDefinitionCache; + + // special caches + private final RestCurrentStateCache _currentStateCache; + + // TODO: add external view caches + + public PerClusterDataProvider(String clusterName, RealmAwareZkClient zkClient, BaseDataAccessor baseDataAccessor) { + _clusterName = clusterName; + _accessor = new ZKHelixDataAccessor(clusterName, baseDataAccessor); + + _zkclient = zkClient; + _instanceConfigCache = null; + _clusterConfigCache = null; + _resourceConfigCache = null; + _liveInstanceCache = null; + _idealStateCache = null; + _stateModelDefinitionCache = null; + _currentStateCache = null; + } + // TODO: consolidate EV from CSs + public Map consolidateExternalViews() { + return null; + } + + // Used for dummy cache. Remove later + public void initCache(final HelixDataAccessor accessor) { + + } + + public void initCache() { + + } +} diff --git a/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestCurrentStateCache.java b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestCurrentStateCache.java new file mode 100644 index 0000000000..c92f4ce5b7 --- /dev/null +++ b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestCurrentStateCache.java @@ -0,0 +1,42 @@ +package org.apache.helix.rest.common.dataprovider; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.util.concurrent.ConcurrentHashMap; +import org.apache.helix.HelixDataAccessor; +import org.apache.helix.model.CurrentState; + + +/** + * Special cache for instances current states. + * + */ +public class RestCurrentStateCache { + + //Map> + private ConcurrentHashMap> _objCache; + + public RestCurrentStateCache() { + _objCache = new ConcurrentHashMap<>(); + } + + public void init(final HelixDataAccessor accessor) { + } +} \ No newline at end of file diff --git a/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestPropertyCache.java b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestPropertyCache.java new file mode 100644 index 0000000000..7cc129f136 --- /dev/null +++ b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestPropertyCache.java @@ -0,0 +1,82 @@ +package org.apache.helix.rest.common.dataprovider; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.util.Collections; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import org.apache.helix.HelixDataAccessor; +import org.apache.helix.HelixProperty; +import org.apache.helix.PropertyKey; +import org.apache.helix.common.caches.PropertyCache; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Class for caching simple HelixProperty objects. + * @param + */ +public class RestPropertyCache { + private static final Logger LOG = LoggerFactory.getLogger(RestPropertyCache.class); + + private ConcurrentHashMap _objCache; + private final String _propertyDescription; + + private final RestPropertyCache.PropertyCacheKeyFuncs _keyFuncs; + + public interface PropertyCacheKeyFuncs { + /** + * Get PropertyKey for the root of this type of object, used for LIST all objects + * @return property key to object root + */ + PropertyKey getRootKey(HelixDataAccessor accessor); + + /** + * Get PropertyKey for a single object of this type, used for GET single instance of the type + * @param objName object name + * @return property key to the object instance + */ + PropertyKey getObjPropertyKey(HelixDataAccessor accessor, String objName); + + /** + * Get the string to identify the object when we actually use them. It's not necessarily the + * "id" field of HelixProperty, but could have more semantic meanings of that object type + * @param obj object instance + * @return object identifier + */ + String getObjName(O obj); + } + + public RestPropertyCache(String propertyDescription, RestPropertyCache.PropertyCacheKeyFuncs keyFuncs) { + _keyFuncs = keyFuncs; + _propertyDescription = propertyDescription; + } + + public void init(final HelixDataAccessor accessor) { + _objCache = new ConcurrentHashMap<>(accessor.getChildValuesMap(_keyFuncs.getRootKey(accessor), true)); + LOG.info("Init RestPropertyCache for {}. ", _propertyDescription); + } + + public Map getPropertyMap() { + return Collections.unmodifiableMap(_objCache); + } + +} diff --git a/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestServiceDataProvider.java b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestServiceDataProvider.java new file mode 100644 index 0000000000..d880dbf7ce --- /dev/null +++ b/helix-rest/src/main/java/org/apache/helix/rest/common/dataprovider/RestServiceDataProvider.java @@ -0,0 +1,43 @@ +package org.apache.helix.rest.common.dataprovider; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.util.HashMap; +import java.util.Map; + + +public class RestServiceDataProvider { + protected Map _clusterDataProviders ; + + + public RestServiceDataProvider() { + _clusterDataProviders = new HashMap<>(); + } + + public PerClusterDataProvider getClusterData(String clusterName) { + return _clusterDataProviders.get(clusterName); + } + + public void addClusterDataProvider(String clusterName, PerClusterDataProvider clusterData) { + _clusterDataProviders.put(clusterName, clusterData); + } + + +}