Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rest cache - property cache #2658

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
import org.apache.helix.controller.stages.CurrentStateOutput;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.Partition;
import org.apache.helix.model.Resource;
import org.apache.helix.model.ResourceAssignment;
Expand Down Expand Up @@ -395,7 +396,8 @@ private Map<String, ResourceAssignment> handleDelayedRebalanceMinActiveReplica(
Map<String, ResourceAssignment> currentResourceAssignment,
RebalanceAlgorithm algorithm) throws HelixRebalanceException {
// the "real" live nodes at the time
final Set<String> enabledLiveInstances = clusterData.getEnabledLiveInstances();
// TODO: this is a hacky way to filter our on operation instance. We should consider redesign `getEnabledLiveInstances()`.
final Set<String> enabledLiveInstances = filterOutOnOperationInstances(clusterData.getInstanceConfigMap(), clusterData.getEnabledLiveInstances());
if (activeNodes.equals(enabledLiveInstances) || !requireRebalanceOverwrite(clusterData, currentResourceAssignment)) {
// no need for additional process, return the current resource assignment
return currentResourceAssignment;
Expand Down Expand Up @@ -424,6 +426,14 @@ private Map<String, ResourceAssignment> handleDelayedRebalanceMinActiveReplica(
}
}

private static Set<String> filterOutOnOperationInstances(Map<String, InstanceConfig> instanceConfigMap,
Set<String> nodes) {
return nodes.stream()
.filter(
instance -> !DelayedAutoRebalancer.INSTANCE_OPERATION_TO_EXCLUDE_FROM_ASSIGNMENT.contains(instanceConfigMap.get(instance).getInstanceOperation()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We probably need checks before instanceConfigMap.get()

.collect(Collectors.toSet());
}

/**
* Emergency rebalance is scheduled to quickly handle urgent cases like reassigning partitions from inactive nodes
* and addressing for partitions failing to meet minActiveReplicas.
Expand Down Expand Up @@ -608,7 +618,8 @@ protected boolean requireRebalanceOverwrite(ResourceControllerDataProvider clust
bestPossibleAssignment.values().parallelStream().forEach((resourceAssignment -> {
String resourceName = resourceAssignment.getResourceName();
IdealState currentIdealState = clusterData.getIdealState(resourceName);
Set<String> enabledLiveInstances = clusterData.getEnabledLiveInstances();
Set<String> enabledLiveInstances =
filterOutOnOperationInstances(clusterData.getInstanceConfigMap(), clusterData.getEnabledLiveInstances());
int numReplica = currentIdealState.getReplicaCount(enabledLiveInstances.size());
int minActiveReplica = DelayedRebalanceUtil.getMinActiveReplica(ResourceConfig
.mergeIdealStateWithResourceConfig(clusterData.getResourceConfig(resourceName),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.HelixRollbackException;
import org.apache.helix.NotificationContext;
import org.apache.helix.PropertyPathBuilder;
import org.apache.helix.TestHelper;
import org.apache.helix.common.ZkTestBase;
import org.apache.helix.constants.InstanceConstants;
Expand Down Expand Up @@ -89,9 +90,11 @@ public void beforeClass() throws Exception {

ClusterConfig clusterConfig = _configAccessor.getClusterConfig(CLUSTER_NAME);
clusterConfig.stateTransitionCancelEnabled(true);
clusterConfig.setDelayRebalaceEnabled(true);
clusterConfig.setRebalanceDelayTime(1800000L);
_configAccessor.setClusterConfig(CLUSTER_NAME, clusterConfig);

createTestDBs(200);
createTestDBs(1800000L);

setUpWagedBaseline();

Expand Down Expand Up @@ -199,7 +202,7 @@ public void testAddingNodeWithEvacuationTag() throws Exception {
public void testEvacuateAndCancelBeforeBootstrapFinish() throws Exception {
// add a resource where downward state transition is slow
createResourceWithDelayedRebalance(CLUSTER_NAME, "TEST_DB3_DELAYED_CRUSHED", "MasterSlave", PARTITIONS, REPLICA,
REPLICA - 1, 200, CrushEdRebalanceStrategy.class.getName());
REPLICA - 1, 200000, CrushEdRebalanceStrategy.class.getName());
_allDBs.add("TEST_DB3_DELAYED_CRUSHED");
// add a resource where downward state transition is slow
createResourceWithWagedRebalance(CLUSTER_NAME, "TEST_DB4_DELAYED_WAGED", "MasterSlave",
Expand Down Expand Up @@ -338,21 +341,38 @@ public void testMarkEvacuationAfterEMM() throws Exception {

@Test(dependsOnMethods = "testMarkEvacuationAfterEMM")
public void testEvacuationWithOfflineInstancesInCluster() throws Exception {
_participants.get(1).syncStop();
_participants.get(2).syncStop();
_participants.get(3).syncStop();
// wait for converge, and set evacuate on instance 0
Assert.assertTrue(_clusterVerifier.verifyByPolling());

String evacuateInstanceName = _participants.get(0).getInstanceName();
String evacuateInstanceName = _participants.get(_participants.size()-2).getInstanceName();
_gSetupTool.getClusterManagementTool()
.setInstanceOperation(CLUSTER_NAME, evacuateInstanceName, InstanceConstants.InstanceOperation.EVACUATE);

Map<String, IdealState> assignment;
List<String> currentActiveInstances =
_participantNames.stream().filter(n -> (!n.equals(evacuateInstanceName) && !n.equals(_participants.get(3).getInstanceName()))).collect(Collectors.toList());
TestHelper.verify( ()-> {return verifyIS(evacuateInstanceName);}, TestHelper.WAIT_DURATION);
Map<String, ExternalView> assignment;
// EV should contain all participants, check resources one by one
assignment = getEV();
for (String resource : _allDBs) {
ExternalView ev = assignment.get(resource);
for (String partition : ev.getPartitionSet()) {
AtomicInteger activeReplicaCount = new AtomicInteger();
ev.getStateMap(partition)
.values()
.stream()
.filter(
v -> v.equals("MASTER") || v.equals("LEADER") || v.equals("SLAVE") || v.equals("FOLLOWER") || v.equals(
"STANDBY"))
.forEach(v -> activeReplicaCount.getAndIncrement());
Assert.assertTrue(activeReplicaCount.get() >= REPLICA-1);
Assert.assertFalse(ev.getStateMap(partition).containsKey(evacuateInstanceName) && ev.getStateMap(partition)
.get(evacuateInstanceName)
.equals("MASTER") && ev.getStateMap(partition)
.get(evacuateInstanceName)
.equals("LEADER"));

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: empty line

}
}

_participants.get(3).syncStart();
_participants.get(1).syncStart();
_participants.get(2).syncStart();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package org.apache.helix.rest.common.dataprovider;

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*
*/

/** Init, register listener and manager callback handler for different
* clusters manage the providers lifecycle
*
*/
Comment on lines +23 to +26
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wrong place for comments.


import java.util.List;
import org.apache.helix.HelixAdmin;
import org.apache.helix.manager.zk.ZkBaseDataAccessor;
import org.apache.helix.zookeeper.api.client.RealmAwareZkClient;


public class HelixRestDataProviderManager {

protected RealmAwareZkClient _zkclient;

private HelixAdmin _helixAdmin;

private RestServiceDataProvider _restServiceDataProvider;
// list of callback handlers

//TODO: create own zk client
public HelixRestDataProviderManager(RealmAwareZkClient zkclient, HelixAdmin helixAdmin) {
_zkclient = zkclient;
_helixAdmin = helixAdmin;
_restServiceDataProvider = new RestServiceDataProvider();
init();
}

public RestServiceDataProvider getRestServiceDataProvider() {
return _restServiceDataProvider;
}

private void init() {
List<String> clusters = _helixAdmin.getClusters();
for (String cluster : clusters) {
PerClusterDataProvider clusterDataProvider =
new PerClusterDataProvider(cluster, _zkclient, new ZkBaseDataAccessor(_zkclient));
clusterDataProvider.initCache();
// register callback handler for each provider
_restServiceDataProvider.addClusterDataProvider(cluster, clusterDataProvider);
}
}

public void close() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Close should pair with connect / start. Do you want to make it as thread based class?

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package org.apache.helix.rest.common.dataprovider;

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

import java.util.Map;
import org.apache.helix.BaseDataAccessor;
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.PropertyKey;
import org.apache.helix.common.caches.PropertyCache;
import org.apache.helix.manager.zk.ZKHelixDataAccessor;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.ClusterConstraints;
import org.apache.helix.model.CurrentState;
import org.apache.helix.model.ExternalView;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.LiveInstance;
import org.apache.helix.model.ResourceConfig;
import org.apache.helix.model.StateModelDefinition;
import org.apache.helix.zookeeper.api.client.RealmAwareZkClient;


/**
* Dara cache for each Helix cluster. Configs, ideal stats and current states are read from ZK and updated
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo:
Dara -> Data
ideal stats -> ideal states

* using event changes. External view are consolidated using current state.
*/
public class PerClusterDataProvider {

private HelixDataAccessor _accessor;

private RealmAwareZkClient _zkclient;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use meta client.


private final String _clusterName;

// Simple caches
private final RestPropertyCache<InstanceConfig> _instanceConfigCache;
private final RestPropertyCache<ClusterConfig> _clusterConfigCache;
private final RestPropertyCache<ResourceConfig> _resourceConfigCache;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need ResourceConfig cache?

private final RestPropertyCache<LiveInstance> _liveInstanceCache;
private final RestPropertyCache<IdealState> _idealStateCache;
private final RestPropertyCache<StateModelDefinition> _stateModelDefinitionCache;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need this?


// special caches
private final RestCurrentStateCache _currentStateCache;

// TODO: add external view caches

public PerClusterDataProvider(String clusterName, RealmAwareZkClient zkClient, BaseDataAccessor baseDataAccessor) {
_clusterName = clusterName;
_accessor = new ZKHelixDataAccessor(clusterName, baseDataAccessor);

_zkclient = zkClient;
_instanceConfigCache = null;
_clusterConfigCache = null;
_resourceConfigCache = null;
_liveInstanceCache = null;
_idealStateCache = null;
_stateModelDefinitionCache = null;
_currentStateCache = null;
}
// TODO: consolidate EV from CSs
public Map<String, ExternalView> consolidateExternalViews() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to understand, if the stoppable check wants to get the external view from the cache, it should call consolidateExternalViews which will generate EV by the data from RestCurrentStateCache? Is that the plan?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should just call getEV. consolidateExternalViews will be called internal in this data provider

return null;
}

// Used for dummy cache. Remove later
public void initCache(final HelixDataAccessor accessor) {

}

public void initCache() {

}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package org.apache.helix.rest.common.dataprovider;

/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

import java.util.concurrent.ConcurrentHashMap;
import org.apache.helix.HelixDataAccessor;
import org.apache.helix.model.CurrentState;


/**
* Special cache for instances current states.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not CurrentStateCache instead of building its own?

*
*/
public class RestCurrentStateCache {

//Map<instanceName, Map<ResourceName, CurrentState>>
private ConcurrentHashMap<String, ConcurrentHashMap<String, CurrentState>> _objCache;

public RestCurrentStateCache() {
_objCache = new ConcurrentHashMap<>();
}

public void init(final HelixDataAccessor accessor) {
}
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NIT: extra new line.

Loading
Loading