Skip to content

Commit

Permalink
[IOTDB-4361] Add a precheck in removing datanode (apache#7264)
Browse files Browse the repository at this point in the history
  • Loading branch information
MiniSho authored Sep 14, 2022
1 parent 8f9e28b commit f776aa5
Showing 3 changed files with 58 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -29,13 +29,17 @@
import org.apache.iotdb.confignode.client.DataNodeRequestType;
import org.apache.iotdb.confignode.client.async.datanode.AsyncDataNodeClientPool;
import org.apache.iotdb.confignode.client.sync.datanode.SyncDataNodeClientPool;
import org.apache.iotdb.confignode.conf.ConfigNodeConfig;
import org.apache.iotdb.confignode.conf.ConfigNodeDescriptor;
import org.apache.iotdb.confignode.consensus.request.write.RemoveDataNodePlan;
import org.apache.iotdb.confignode.consensus.request.write.UpdateRegionLocationPlan;
import org.apache.iotdb.confignode.consensus.response.DataNodeToStatusResp;
import org.apache.iotdb.confignode.manager.ConfigManager;
import org.apache.iotdb.confignode.manager.load.heartbeat.BaseNodeCache;
import org.apache.iotdb.confignode.persistence.NodeInfo;
import org.apache.iotdb.confignode.procedure.exception.ProcedureException;
import org.apache.iotdb.confignode.procedure.scheduler.LockQueue;
import org.apache.iotdb.consensus.ConsensusFactory;
import org.apache.iotdb.mpp.rpc.thrift.TCreatePeerReq;
import org.apache.iotdb.mpp.rpc.thrift.TDisableDataNodeReq;
import org.apache.iotdb.mpp.rpc.thrift.TMaintainPeerReq;
@@ -53,6 +57,8 @@
public class DataNodeRemoveHandler {
private static final Logger LOGGER = LoggerFactory.getLogger(DataNodeRemoveHandler.class);

private static final ConfigNodeConfig CONF = ConfigNodeDescriptor.getInstance().getConf();

private final ConfigManager configManager;

/** region migrate lock */
@@ -386,14 +392,20 @@ public TSStatus stopDataNode(TDataNodeLocation dataNode) throws ProcedureExcepti
public DataNodeToStatusResp checkRemoveDataNodeRequest(RemoveDataNodePlan removeDataNodePlan) {
DataNodeToStatusResp dataSet = new DataNodeToStatusResp();
dataSet.setStatus(new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode()));
TSStatus status = checkRegionReplication(removeDataNodePlan);
if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {

TSStatus status = checkClusterProtocol();
if (isFailed(status)) {
dataSet.setStatus(status);
return dataSet;
}
status = checkRegionReplication(removeDataNodePlan);
if (isFailed(status)) {
dataSet.setStatus(status);
return dataSet;
}

status = checkDataNodeExist(removeDataNodePlan);
if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
if (isFailed(status)) {
dataSet.setStatus(status);
return dataSet;
}
@@ -433,8 +445,31 @@ private TSStatus checkDataNodeExist(RemoveDataNodePlan removeDataNodePlan) {
*/
private TSStatus checkRegionReplication(RemoveDataNodePlan removeDataNodePlan) {
TSStatus status = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode());
int removedDataNodeSize = removeDataNodePlan.getDataNodeLocations().size();
List<TDataNodeLocation> removedDataNodes = removeDataNodePlan.getDataNodeLocations();
int allDataNodeSize = configManager.getNodeManager().getRegisteredDataNodeCount();

// when the configuration is one replication, it will be failed if the data node is not in
// running state.
if (CONF.getSchemaReplicationFactor() == 1 || CONF.getDataReplicationFactor() == 1) {
for (TDataNodeLocation dataNodeLocation : removedDataNodes) {
// check whether removed data node is in running state
BaseNodeCache nodeCache =
configManager.getNodeManager().getNodeCacheMap().get(dataNodeLocation.getDataNodeId());
if (!nodeCache.getNodeStatus().getStatus().equals("Running")) {
removedDataNodes.remove(dataNodeLocation);
LOGGER.error(
"Failed to remove data node {} because it is not in running and the configuration of cluster is one replication",
dataNodeLocation);
}
if (removedDataNodes.size() == 0) {
status.setCode(TSStatusCode.LACK_REPLICATION.getStatusCode());
status.setMessage("Failed to remove all requested data nodes");
return status;
}
}
}

int removedDataNodeSize = removeDataNodePlan.getDataNodeLocations().size();
if (allDataNodeSize - removedDataNodeSize < NodeInfo.getMinimumDataNode()) {
status.setCode(TSStatusCode.LACK_REPLICATION.getStatusCode());
status.setMessage(
@@ -492,4 +527,21 @@ private Optional<TDataNodeLocation> filterDataNodeWithOtherRegionReplica(
// TODO replace findAny() by select the low load node.
return regionReplicaNodes.stream().filter(e -> !e.equals(filterLocation)).findAny();
}

/**
 * Check the consensus protocol configured for the cluster. Removing a DataNode is currently
 * unsupported when either the data-region or the schema-region consensus protocol is the
 * standalone protocol.
 *
 * @return a status with SUCCESS_STATUS if neither region uses the standalone protocol,
 *     REMOVE_DATANODE_FAILED otherwise
 */
private TSStatus checkClusterProtocol() {
  TSStatus result = new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode());
  // Evaluate each region's protocol separately for readability; getter call order is preserved.
  boolean dataRegionIsStandalone =
      CONF.getDataRegionConsensusProtocolClass().equals(ConsensusFactory.StandAloneConsensus);
  boolean schemaRegionIsStandalone =
      CONF.getSchemaRegionConsensusProtocolClass().equals(ConsensusFactory.StandAloneConsensus);
  if (dataRegionIsStandalone || schemaRegionIsStandalone) {
    result.setCode(TSStatusCode.REMOVE_DATANODE_FAILED.getStatusCode());
    result.setMessage("standalone protocol is not supported to remove data node");
  }
  return result;
}
}
Original file line number Diff line number Diff line change
@@ -38,7 +38,6 @@ public static BaseConfig getConfig() {
break;
case "LocalStandaloneOnMpp":
case "Cluster1":
case "Cluster2":
config = new MppConfig();
break;
case "Remote":
Original file line number Diff line number Diff line change
@@ -152,7 +152,8 @@ public enum TSStatusCode {
REGION_MIGRATE_FAILED(915),
LACK_REPLICATION(916),
DATANODE_STOP_ERROR(917),
REGION_LEADER_CHANGE_FAILED(918);
REGION_LEADER_CHANGE_FAILED(918),
REMOVE_DATANODE_FAILED(919);

private int statusCode;

0 comments on commit f776aa5

Please sign in to comment.