From 3af21e398fb75373ea6a8a17b348372f7798379e Mon Sep 17 00:00:00 2001 From: Karthik Kumarguru Date: Fri, 9 Oct 2020 10:55:30 -0700 Subject: [PATCH] Add under utilization related RCAs and decider --- .../deciders/jvm/HeapHealthDecider.java | 6 +- .../jvm/sizing/HeapSizeIncreasePolicy.java | 76 +++++++-- .../metrics/AllMetrics.java | 12 +- .../configs/CpuUnderUtilizedRcaConfig.java | 51 ++++++ .../configs/DiskUnderUtilizedRcaConfig.java | 51 ++++++ .../configs/HeapSizeIncreasePolicyConfig.java | 66 ++++++-- .../rca/framework/api/Resources.java | 6 +- .../api/metrics/Partition_FreeSpace.java | 13 ++ .../api/metrics/Partition_TotalSpace.java | 13 ++ .../metrics/Partition_UsableFreeSpace.java | 13 ++ .../rca/framework/api/metrics/ShardSize.java | 2 +- .../rca/framework/core/GenericContext.java | 5 + .../rca/framework/core/RcaConf.java | 10 ++ .../rca/framework/util/RcaConsts.java | 6 +- .../rca/store/ElasticSearchAnalysisGraph.java | 43 ++++- .../rca/jvmsizing/HighOldGenOccupancyRca.java | 1 - .../ClusterUnderUtilizedRca.java | 63 ++++++++ .../underutilization/CpuUnderUtilizedRca.java | 126 +++++++++++++++ .../DiskUnderUtilizedRca.java | 150 ++++++++++++++++++ .../NodeUnderUtilizedRca.java | 84 ++++++++++ src/main/proto/inter_node_rpc_service.proto | 6 + .../sizing/HeapSizeIncreasePolicyTest.java | 6 +- 22 files changed, 765 insertions(+), 44 deletions(-) create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/CpuUnderUtilizedRcaConfig.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/DiskUnderUtilizedRcaConfig.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_FreeSpace.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_TotalSpace.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_UsableFreeSpace.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/ClusterUnderUtilizedRca.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/CpuUnderUtilizedRca.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/DiskUnderUtilizedRca.java create mode 100644 src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/NodeUnderUtilizedRca.java diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/HeapHealthDecider.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/HeapHealthDecider.java index 3f0f4348e..4a1bbe3be 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/HeapHealthDecider.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/HeapHealthDecider.java @@ -24,6 +24,7 @@ import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.RcaConf; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.ClusterUnderUtilizedRca; import java.util.List; /** @@ -39,12 +40,13 @@ public class HeapHealthDecider extends Decider { private int counter = 0; public HeapHealthDecider(int decisionFrequency, - final HighHeapUsageClusterRca highHeapUsageClusterRca, LargeHeapClusterRca largeHeapClusterRca) { + final HighHeapUsageClusterRca highHeapUsageClusterRca, + LargeHeapClusterRca largeHeapClusterRca, ClusterUnderUtilizedRca underUtilizedRca) { //TODO : refactor parent class to remove evalIntervalSeconds completely super(EVAL_INTERVAL_IN_S, decisionFrequency); oldGenDecisionPolicy = new OldGenDecisionPolicy(highHeapUsageClusterRca); jvmGenTuningPolicy = new JvmGenTuningPolicy(highHeapUsageClusterRca); - heapSizeIncreasePolicy = new HeapSizeIncreasePolicy(largeHeapClusterRca); + heapSizeIncreasePolicy = new HeapSizeIncreasePolicy(largeHeapClusterRca, underUtilizedRca); } @Override diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicy.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicy.java index 74dec6532..55a7a159a 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicy.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicy.java @@ -29,26 +29,33 @@ import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.util.RcaConsts; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.cluster.NodeKey; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.ClusterUnderUtilizedRca; import com.google.common.annotations.VisibleForTesting; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Optional; import javax.annotation.Nonnull; public class HeapSizeIncreasePolicy implements DecisionPolicy { private final LargeHeapClusterRca largeHeapClusterRca; + private final ClusterUnderUtilizedRca clusterUnderUtilizedRca; + private final HeapSizeIncreaseClusterMonitor heapSizeIncreaseContentionClusterMonitor; + private final HeapSizeIncreaseClusterMonitor heapSizeIncreaseUnderUtilizationClusterMonitor; + private AppContext appContext; private RcaConf rcaConf; - private final HeapSizeIncreaseClusterMonitor heapSizeIncreaseClusterMonitor; - private int unhealthyNodePercentage; - public HeapSizeIncreasePolicy(final LargeHeapClusterRca largeHeapClusterRca) { - this.heapSizeIncreaseClusterMonitor = new HeapSizeIncreaseClusterMonitor(); + public HeapSizeIncreasePolicy(final LargeHeapClusterRca largeHeapClusterRca, final + ClusterUnderUtilizedRca clusterUnderUtilizedRca) { + this.heapSizeIncreaseContentionClusterMonitor = new HeapSizeIncreaseClusterMonitor(); + this.heapSizeIncreaseUnderUtilizationClusterMonitor = new HeapSizeIncreaseClusterMonitor(); this.largeHeapClusterRca = largeHeapClusterRca; + this.clusterUnderUtilizedRca = clusterUnderUtilizedRca; } @Override @@ -56,17 +63,37 @@ public List evaluate() { addToClusterMonitor(); List actions = new ArrayList<>(); - if (!heapSizeIncreaseClusterMonitor.isHealthy()) { - Action heapSizeIncreaseAction = new HeapSizeIncreaseAction(appContext); - if (heapSizeIncreaseAction.isActionable()) { - actions.add(heapSizeIncreaseAction); + if (!heapSizeIncreaseContentionClusterMonitor.isHealthy()) { + getHeapSizeIncreaseActionIfActionable().ifPresent(actions::add); + } + + // Since both contention and under utilization add the same action, we don't want to add + // the same action twice. If the action is already added as part of contention, then skip + // checking for under utilization. + if (actions.isEmpty()) { + if (!heapSizeIncreaseUnderUtilizationClusterMonitor.isHealthy()) { + getHeapSizeIncreaseActionIfActionable().ifPresent(actions::add); } } return actions; } + private Optional getHeapSizeIncreaseActionIfActionable() { + final Action heapSizeIncreaseAction = new HeapSizeIncreaseAction(appContext); + if (heapSizeIncreaseAction.isActionable()) { + return Optional.of(heapSizeIncreaseAction); + } + + return Optional.empty(); + } + private void addToClusterMonitor() { + addToContentionClusterMonitor(); + addToUnderUtilizationClusterMonitor(); + } + + private void addToContentionClusterMonitor() { long currTime = System.currentTimeMillis(); if (largeHeapClusterRca.getFlowUnits().isEmpty()) { return; @@ -79,7 +106,27 @@ private void addToClusterMonitor() { List hotNodeSummaries = flowUnit.getSummary().getHotNodeSummaryList(); hotNodeSummaries.forEach(hotNodeSummary -> { NodeKey nodeKey = new NodeKey(hotNodeSummary.getNodeID(), hotNodeSummary.getHostAddress()); - heapSizeIncreaseClusterMonitor.recordIssue(nodeKey, currTime); + heapSizeIncreaseContentionClusterMonitor.recordIssue(nodeKey, currTime); + }); + } + + private void addToUnderUtilizationClusterMonitor() { + long currTime = System.currentTimeMillis(); + if (clusterUnderUtilizedRca.getFlowUnits().isEmpty()) { + return; + } + + final ResourceFlowUnit flowUnit = + clusterUnderUtilizedRca.getFlowUnits().get(0); + + if (!flowUnit.getResourceContext().isUnderUtilized()) { + return; + } + + List hotNodeSummaries = flowUnit.getSummary().getHotNodeSummaryList(); + hotNodeSummaries.forEach(hotNodeSummary -> { + NodeKey key = new NodeKey(hotNodeSummary.getNodeID(), hotNodeSummary.getHostAddress()); + heapSizeIncreaseUnderUtilizationClusterMonitor.recordIssue(key, currTime); }); } @@ -136,9 +183,14 @@ public void setRcaConf(final RcaConf rcaConf) { private void readThresholdValuesFromConf() { HeapSizeIncreasePolicyConfig policyConfig = rcaConf.getJvmScaleUpPolicyConfig(); this.unhealthyNodePercentage = policyConfig.getUnhealthyNodePercentage(); - this.heapSizeIncreaseClusterMonitor.setDayBreachThreshold(policyConfig.getDayBreachThreshold()); - this.heapSizeIncreaseClusterMonitor - .setWeekBreachThreshold(policyConfig.getWeekBreachThreshold()); + this.heapSizeIncreaseContentionClusterMonitor.setDayBreachThreshold(policyConfig.getDayBreachThresholdForContention()); + this.heapSizeIncreaseContentionClusterMonitor + .setWeekBreachThreshold(policyConfig.getWeekBreachThresholdForContention()); + + this.heapSizeIncreaseUnderUtilizationClusterMonitor + .setDayBreachThreshold(policyConfig.getDayBreachThresholdForUnderUtilization()); + this.heapSizeIncreaseUnderUtilizationClusterMonitor + .setWeekBreachThreshold(policyConfig.getWeekBreachThresholdForUnderUtilization()); } @VisibleForTesting diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/metrics/AllMetrics.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/metrics/AllMetrics.java index 8dd261684..a8b96fd92 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/metrics/AllMetrics.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/metrics/AllMetrics.java @@ -415,7 +415,7 @@ public static class Constants { } } - public enum DevicePartitionDimension implements MetricDimension { + public enum DevicePartitionDimension implements MetricDimension, JooqFieldValue { MOUNT_POINT(Constants.MOUNT_POINT_VALUE), DEVICE_PARTITION(Constants.DEVICE_PARTITION_VALUE); @@ -430,6 +430,16 @@ public String toString() { return this.value; } + @Override + public String getName() { + return this.value; + } + + @Override + public Field getField() { + return DSL.field(DSL.name(this.value), String.class); + } + public static class Constants { public static final String MOUNT_POINT_VALUE = "MountPoint"; diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/CpuUnderUtilizedRcaConfig.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/CpuUnderUtilizedRcaConfig.java new file mode 100644 index 000000000..ecce6d8d0 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/CpuUnderUtilizedRcaConfig.java @@ -0,0 +1,51 @@ +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.RcaConf; + +public class CpuUnderUtilizedRcaConfig { + + private static final String CONFIG_NAME = "cpu-underutilized-rca-config"; + private static final double DEFAULT_MIN_CPU_UTILIZATION = 20D; + private final double cpuUtilizationThreshold; + + public CpuUnderUtilizedRcaConfig(final RcaConf rcaConf) { + this.cpuUtilizationThreshold = rcaConf.readRcaConfig(CONFIG_NAME, + CpuUnderUtilizedRcaConfigKeys.MIN_CPU_UTILIZATION_THRESHOLD.toString(), + DEFAULT_MIN_CPU_UTILIZATION, Double.class); + } + + public double getCpuUtilizationThreshold() { + return this.cpuUtilizationThreshold; + } + + enum CpuUnderUtilizedRcaConfigKeys { + MIN_CPU_UTILIZATION_THRESHOLD("min-cpu-utilization-threshold"); + + private String value; + + CpuUnderUtilizedRcaConfigKeys(final String value) { + this.value = value; + } + + + @Override + public String toString() { + return this.value; + } + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/DiskUnderUtilizedRcaConfig.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/DiskUnderUtilizedRcaConfig.java new file mode 100644 index 000000000..20b662384 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/DiskUnderUtilizedRcaConfig.java @@ -0,0 +1,51 @@ +/* + * Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.RcaConf; + +public class DiskUnderUtilizedRcaConfig { + + private static final String CONFIG_NAME = "disk-under-utilized-config"; + public static final double DEFAULT_SHARD_DISK_SPACE_UTILIZATION_PERCENT = 20D; + private final double shardDiskSpaceUtilizationThreshold; + + public DiskUnderUtilizedRcaConfig(final RcaConf rcaConf) { + this.shardDiskSpaceUtilizationThreshold = rcaConf.readRcaConfig(CONFIG_NAME, + DiskUnderUtilizedConfigKeys.SHARD_DISK_SPACE_UTILIZATION_THRESHOLD.toString(), + DEFAULT_SHARD_DISK_SPACE_UTILIZATION_PERCENT, Double.class); + } + + public double getShardDiskSpaceUtilizationThreshold() { + return shardDiskSpaceUtilizationThreshold; + } + + enum DiskUnderUtilizedConfigKeys { + SHARD_DISK_SPACE_UTILIZATION_THRESHOLD("shard-disk-space-utilization-threshold"); + + private final String value; + + DiskUnderUtilizedConfigKeys(final String value) { + this.value = value; + } + + + @Override + public String toString() { + return this.value; + } + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/HeapSizeIncreasePolicyConfig.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/HeapSizeIncreasePolicyConfig.java index b414409a5..ffb7f7dfd 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/HeapSizeIncreasePolicyConfig.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/configs/HeapSizeIncreasePolicyConfig.java @@ -21,29 +21,53 @@ public class HeapSizeIncreasePolicyConfig { private static final String POLICY_NAME = "heap-size-increase-policy"; public static final int DEFAULT_UNHEALTHY_NODE_PERCENTAGE = 50; + public static final int DEFAULT_UNDER_UTILIZED_NODE_PERCENTAGE = 70; public static final int DEFAULT_MIN_UNHEALTHY_MINUTES = 2 * 24 * 60; - private static final int DEFAULT_DAY_BREACH_THRESHOLD = 8; - private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 3; + private static final int DEFAULT_DAY_BREACH_THRESHOLD_CONTENTION = 8; + private static final int DEFAULT_WEEK_BREACH_THRESHOLD_CONTENTION = 3; + + // 24 30m aggregated values - 12hrs of under utilization needed to breach. + private static final int DEFAULT_DAY_BREACH_THRESHOLD_UNDER_UTILIZATION = 24; + + // 4 such days of under utilization needed to breach. + private static final int DEFAULT_WEEK_BREACH_THRESHOLD_UNDER_UTILIZATION = 4; + private final int unhealthyNodePercentage; - private final int dayBreachThreshold; - private final int weekBreachThreshold; + private final int underUtilizedNodePercentage; + private final int dayBreachThresholdForContention; + private final int weekBreachThresholdForContention; + private final int dayBreachThresholdForUnderUtilization; + private final int weekBreachThresholdForUnderUtilization; public HeapSizeIncreasePolicyConfig(final RcaConf rcaConf) { this.unhealthyNodePercentage = rcaConf.readRcaConfig(POLICY_NAME, HeapSizeIncreasePolicyKeys.UNHEALTHY_NODE_PERCENTAGE_KEY.toString(), DEFAULT_UNHEALTHY_NODE_PERCENTAGE, Integer.class); - this.dayBreachThreshold = rcaConf.readRcaConfig(POLICY_NAME, - HeapSizeIncreasePolicyKeys.DAY_BREACH_THRESHOLD_KEY.toString(), DEFAULT_DAY_BREACH_THRESHOLD, + this.underUtilizedNodePercentage = rcaConf.readRcaConfig(POLICY_NAME, + HeapSizeIncreasePolicyKeys.UNDER_UTILIZED_NODE_PERCENTAGE_KEY.toString(), + DEFAULT_UNDER_UTILIZED_NODE_PERCENTAGE, Integer.class); + this.dayBreachThresholdForContention = rcaConf.readRcaConfig(POLICY_NAME, + HeapSizeIncreasePolicyKeys.DAY_BREACH_THRESHOLD_CONTENTION_KEY.toString(), + DEFAULT_DAY_BREACH_THRESHOLD_CONTENTION, Integer.class); - this.weekBreachThreshold = rcaConf - .readRcaConfig(POLICY_NAME, HeapSizeIncreasePolicyKeys.WEEK_BREACH_THRESHOLD_KEY - .toString(), DEFAULT_WEEK_BREACH_THRESHOLD, Integer.class); + this.weekBreachThresholdForContention = rcaConf + .readRcaConfig(POLICY_NAME, HeapSizeIncreasePolicyKeys.WEEK_BREACH_THRESHOLD_CONTENTION_KEY + .toString(), DEFAULT_WEEK_BREACH_THRESHOLD_CONTENTION, Integer.class); + this.dayBreachThresholdForUnderUtilization = rcaConf.readRcaConfig(POLICY_NAME, + HeapSizeIncreasePolicyKeys.DAY_BREACH_THRESHOLD_UNDER_UTILIZATION_KEY.toString(), + DEFAULT_DAY_BREACH_THRESHOLD_UNDER_UTILIZATION, Integer.class); + this.weekBreachThresholdForUnderUtilization = rcaConf.readRcaConfig(POLICY_NAME, + HeapSizeIncreasePolicyKeys.WEEK_BREACH_THRESHOLD_UNDER_UTILIZATION_KEY.toString(), + DEFAULT_WEEK_BREACH_THRESHOLD_UNDER_UTILIZATION, Integer.class); } enum HeapSizeIncreasePolicyKeys { UNHEALTHY_NODE_PERCENTAGE_KEY("unhealthy-node-percentage"), - DAY_BREACH_THRESHOLD_KEY("day-breach-threshold"), - WEEK_BREACH_THRESHOLD_KEY("week-breach-threshold"); + UNDER_UTILIZED_NODE_PERCENTAGE_KEY("under-utilized-node-percentage"), + DAY_BREACH_THRESHOLD_CONTENTION_KEY("day-breach-threshold-contention"), + DAY_BREACH_THRESHOLD_UNDER_UTILIZATION_KEY("day-breach-threshold-under-utilization"), + WEEK_BREACH_THRESHOLD_CONTENTION_KEY("week-breach-threshold-contention"), + WEEK_BREACH_THRESHOLD_UNDER_UTILIZATION_KEY("week-breach-threshold-under-utilization"); private final String value; @@ -61,11 +85,23 @@ public int getUnhealthyNodePercentage() { return unhealthyNodePercentage; } - public int getDayBreachThreshold() { - return dayBreachThreshold; + public int getUnderUtilizedNodePercentage() { + return underUtilizedNodePercentage; + } + + public int getDayBreachThresholdForContention() { + return dayBreachThresholdForContention; + } + + public int getWeekBreachThresholdForContention() { + return weekBreachThresholdForContention; + } + + public int getDayBreachThresholdForUnderUtilization() { + return dayBreachThresholdForUnderUtilization; } - public int getWeekBreachThreshold() { - return weekBreachThreshold; + public int getWeekBreachThresholdForUnderUtilization() { + return weekBreachThresholdForUnderUtilization; } } diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/Resources.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/Resources.java index 1acaf3998..c75b5d99f 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/Resources.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/Resources.java @@ -152,7 +152,8 @@ public enum State { UNHEALTHY(Constants.UNHEALTHY_VALUE), CONTENDED(Constants.CONTENDED_VALUE), STARVED(Constants.STARVED_VALUE), - UNKNOWN(Constants.UNKOWN_VALUE); + UNKNOWN(Constants.UNKNOWN_VALUE), + UNDERUTILIZED(Constants.UNDERUTILIZED_VALUE); private final String value; @@ -171,7 +172,8 @@ public static class Constants { public static final String UNHEALTHY_VALUE = "unhealthy"; public static final String CONTENDED_VALUE = "contended"; public static final String STARVED_VALUE = "starved"; - public static final String UNKOWN_VALUE = "unknown"; + public static final String UNKNOWN_VALUE = "unknown"; + public static final String UNDERUTILIZED_VALUE = "underutilized"; } } } diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_FreeSpace.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_FreeSpace.java new file mode 100644 index 000000000..8d1487e20 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_FreeSpace.java @@ -0,0 +1,13 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.DevicePartitionValue; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric; + +public class Partition_FreeSpace extends Metric { + + public static final String NAME = DevicePartitionValue.FREE_SPACE.toString(); + + public Partition_FreeSpace(long evalIntervalInSeconds) { + super(NAME, evalIntervalInSeconds); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_TotalSpace.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_TotalSpace.java new file mode 100644 index 000000000..bd303b8b4 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_TotalSpace.java @@ -0,0 +1,13 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.DevicePartitionValue; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric; + +public class Partition_TotalSpace extends Metric { + + public static final String NAME = DevicePartitionValue.TOTAL_SPACE.toString(); + + public Partition_TotalSpace(long evalIntervalInSeconds) { + super(NAME, evalIntervalInSeconds); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_UsableFreeSpace.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_UsableFreeSpace.java new file mode 100644 index 000000000..26cc8a0a6 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/Partition_UsableFreeSpace.java @@ -0,0 +1,13 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.DevicePartitionValue; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric; + +public class Partition_UsableFreeSpace extends Metric { + + public static final String NAME = DevicePartitionValue.USABLE_FREE_SPACE.toString(); + + public Partition_UsableFreeSpace(long evalIntervalInSeconds) { + super(NAME, evalIntervalInSeconds); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/ShardSize.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/ShardSize.java index c10e563d0..d752b27d9 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/ShardSize.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/api/metrics/ShardSize.java @@ -28,6 +28,6 @@ public class ShardSize extends Metric { public static final String NAME = AllMetrics.ShardStatsValue.SHARD_SIZE_IN_BYTES.toString(); public ShardSize(long evaluationIntervalSeconds) { - super(AllMetrics.ShardStatsValue.SHARD_SIZE_IN_BYTES.name(), evaluationIntervalSeconds); + super(NAME, evaluationIntervalSeconds); } } diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/GenericContext.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/GenericContext.java index 246fd316d..417b23e87 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/GenericContext.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/GenericContext.java @@ -16,6 +16,7 @@ package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources.State; public abstract class GenericContext { private final Resources.State state; @@ -40,6 +41,10 @@ public boolean isUnknown() { return this.state == Resources.State.UNKNOWN; } + public boolean isUnderUtilized() { + return this.state == State.UNDERUTILIZED; + } + @Override public String toString() { return this.state.toString(); diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/RcaConf.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/RcaConf.java index 9813e58a3..38fec697e 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/RcaConf.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/core/RcaConf.java @@ -21,6 +21,8 @@ import com.amazon.opendistro.elasticsearch.performanceanalyzer.decisionmaker.actions.configs.QueueActionConfig; import com.amazon.opendistro.elasticsearch.performanceanalyzer.decisionmaker.deciders.configs.DeciderConfig; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.RcaControllerHelper; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.CpuUnderUtilizedRcaConfig; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.DiskUnderUtilizedRcaConfig; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.FieldDataCacheRcaConfig; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.HeapSizeIncreasePolicyConfig; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.HighHeapUsageOldGenRcaConfig; @@ -236,6 +238,14 @@ public OldGenContendedRcaConfig getOldGenContendedRcaConfig() { return new OldGenContendedRcaConfig(this); } + public CpuUnderUtilizedRcaConfig getCpuUnderUtilizedRcaConfig() { + return new CpuUnderUtilizedRcaConfig(this); + } + + public DiskUnderUtilizedRcaConfig getDiskUnderUtilizedRcaConfig() { + return new DiskUnderUtilizedRcaConfig(this); + } + public T readRcaConfig(String rcaName, String key, T defaultValue, Class clazz) { return readRcaConfig(rcaName, key, defaultValue, (s) -> true, clazz); } diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/util/RcaConsts.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/util/RcaConsts.java index 69c8b4a4b..11f2a9144 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/util/RcaConsts.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/framework/util/RcaConsts.java @@ -16,10 +16,6 @@ package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.util; import com.amazon.opendistro.elasticsearch.performanceanalyzer.core.Util; -import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; -import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; -import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; -import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; import java.nio.file.Paths; import java.util.concurrent.TimeUnit; @@ -49,6 +45,8 @@ public class RcaConsts { public static final String MUTE_ERROR_METRIC = "MuteError"; public static final String HOT_SHARD_RCA_ERROR_METRIC = "HotShardError"; public static final String WRITE_UPDATED_RCA_CONF_ERROR = "WriteUpdatedRcaConfError"; + public static final String CPU_UU_RCA = "CpuUnderUtilizedRcaError"; + public static final String DISK_UU_RCA = "DiskUnderUtilizedRcaError"; static final String dir = System.getProperty("user.dir"); public static final String TEST_CONFIG_PATH = diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/ElasticSearchAnalysisGraph.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/ElasticSearchAnalysisGraph.java index cc3bea4d0..7d400db68 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/ElasticSearchAnalysisGraph.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/ElasticSearchAnalysisGraph.java @@ -53,15 +53,16 @@ import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.IO_TotalSyscallRate; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.IndexWriter_Memory; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.Norms_Memory; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.Partition_TotalSpace; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.Points_Memory; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.Segments_Memory; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.ShardSize; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.StoredFields_Memory; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.TermVectors_Memory; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.Terms_Memory; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_QueueCapacity; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_RejectedReqs; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.metrics.VersionMap_Memory; -import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.Node; @@ -98,6 +99,10 @@ import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenContendedRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenReclamationRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.ClusterUnderUtilizedRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.CpuUnderUtilizedRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.DiskUnderUtilizedRca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization.NodeUnderUtilizedRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.temperature.ClusterTemperatureRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.temperature.NodeTemperatureRca; import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.temperature.dimension.CpuUtilDimensionTemperatureRca; @@ -129,18 +134,27 @@ public void construct() { Metric cpuUtilizationGroupByOperation = new AggregateMetric(1, CPU_Utilization.NAME, AggregateFunction.SUM, MetricsDB.AVG, CommonDimension.OPERATION.toString()); + Metric cpuUtilization = new CPU_Utilization(EVALUATION_INTERVAL_SECONDS); + Metric totalDiskSpace = new Partition_TotalSpace(EVALUATION_INTERVAL_SECONDS); + Metric shardSizeInBytes = new ShardSize(EVALUATION_INTERVAL_SECONDS); heapUsed.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); gcEvent.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); heapMax.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); gc_Collection_Time.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); cpuUtilizationGroupByOperation.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + cpuUtilization.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + totalDiskSpace.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + shardSizeInBytes.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); addLeaf(heapUsed); addLeaf(gcEvent); addLeaf(heapMax); addLeaf(gc_Collection_Time); addLeaf(cpuUtilizationGroupByOperation); + addLeaf(cpuUtilization); + addLeaf(totalDiskSpace); + addLeaf(shardSizeInBytes); //add node stats metrics List nodeStatsMetrics = constructNodeStatsMetrics(); @@ -177,6 +191,7 @@ public void construct() { hotNodeClusterRca.addTag(TAG_LOCUS, LOCUS_MASTER_NODE); hotNodeClusterRca.addAllUpstreams(Collections.singletonList(hotJVMNodeRca)); + //128g heap - contention based final HighOldGenOccupancyRca oldGenOccupancyRca = new HighOldGenOccupancyRca(heapMax, heapUsed); oldGenOccupancyRca.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); oldGenOccupancyRca.addAllUpstreams(Arrays.asList(heapMax, heapUsed)); @@ -196,11 +211,33 @@ public void construct() { largeHeapClusterRca.addAllUpstreams(Collections.singletonList(oldGenContendedRca)); largeHeapClusterRca.addTag(TAG_AGGREGATE_UPSTREAM, LOCUS_DATA_NODE); + // 128g - under-utilization based + final CpuUnderUtilizedRca cpuUnderUtilizedRca = new CpuUnderUtilizedRca(cpuUtilization); + cpuUnderUtilizedRca.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + cpuUnderUtilizedRca.addAllUpstreams(Collections.singletonList(cpuUtilization)); + + final DiskUnderUtilizedRca diskUnderUtilizedRca = new DiskUnderUtilizedRca(totalDiskSpace, + shardSizeInBytes); + diskUnderUtilizedRca.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + diskUnderUtilizedRca.addAllUpstreams(Arrays.asList(totalDiskSpace, shardSizeInBytes)); + + final NodeUnderUtilizedRca nodeUnderUtilizedRca = + new NodeUnderUtilizedRca(cpuUnderUtilizedRca, diskUnderUtilizedRca); + nodeUnderUtilizedRca.addTag(TAG_LOCUS, LOCUS_DATA_MASTER_NODE); + nodeUnderUtilizedRca.addAllUpstreams(Arrays.asList(cpuUnderUtilizedRca, diskUnderUtilizedRca)); + + final ClusterUnderUtilizedRca clusterUnderUtilizedRca = new ClusterUnderUtilizedRca( + nodeUnderUtilizedRca); + clusterUnderUtilizedRca.addTag(TAG_LOCUS, LOCUS_MASTER_NODE); + clusterUnderUtilizedRca.addTag(TAG_AGGREGATE_UPSTREAM, LOCUS_DATA_NODE); + clusterUnderUtilizedRca.addAllUpstreams(Collections.singletonList(nodeUnderUtilizedRca)); + // Heap Health Decider HeapHealthDecider heapHealthDecider = new HeapHealthDecider(RCA_PERIOD, highHeapUsageClusterRca, - largeHeapClusterRca); + largeHeapClusterRca, clusterUnderUtilizedRca); heapHealthDecider.addTag(TAG_LOCUS, LOCUS_MASTER_NODE); - heapHealthDecider.addAllUpstreams(Collections.singletonList(highHeapUsageClusterRca)); + heapHealthDecider.addAllUpstreams(Arrays.asList(highHeapUsageClusterRca, + largeHeapClusterRca, clusterUnderUtilizedRca)); /* Queue Rejection RCAs */ diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/HighOldGenOccupancyRca.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/HighOldGenOccupancyRca.java index 9737681d7..4849bda91 100644 --- a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/HighOldGenOccupancyRca.java +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/HighOldGenOccupancyRca.java @@ -35,7 +35,6 @@ public class HighOldGenOccupancyRca extends OldGenRca> { + private static final long EVAL_INTERVAL_IN_S = 5; + + private final Rca> nodeUnderUtilizedRca; + + public ClusterUnderUtilizedRca(final Rca> nodeUnderUtilizedRca) { + super(EVAL_INTERVAL_IN_S); + this.nodeUnderUtilizedRca = nodeUnderUtilizedRca; + } + + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + throw new UnsupportedOperationException("generateFlowUnitFromWire is not supported on this " + + "node-local RCA: " + args.getNode().name()); + } + + @Override + public ResourceFlowUnit operate() { + long currTime = System.currentTimeMillis(); + List> nodeUnderUtilizedFlowUnits = nodeUnderUtilizedRca + .getFlowUnits(); + + List underUtilizedNodeSummaries = new ArrayList<>(); + for (final ResourceFlowUnit flowUnit : nodeUnderUtilizedFlowUnits) { + if (flowUnit.isEmpty()) { + continue; + } + + if (flowUnit.getResourceContext().isUnderUtilized()) { + underUtilizedNodeSummaries.add(flowUnit.getSummary()); + } + } + + if (underUtilizedNodeSummaries.isEmpty()) { + return new ResourceFlowUnit<>(currTime); + } + + final HotClusterSummary clusterSummary = new HotClusterSummary( + getAppContext().getAllClusterInstances().size(), + underUtilizedNodeSummaries.stream().map(HotNodeSummary::getNodeID).collect( + Collectors.toSet()).size()); + + for (final HotNodeSummary underUtilizedNodeSummary : underUtilizedNodeSummaries) { + clusterSummary.appendNestedSummary(underUtilizedNodeSummary); + } + + ResourceContext context = new ResourceContext(State.UNDERUTILIZED); + return new ResourceFlowUnit<>(currTime, context, clusterSummary); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/CpuUnderUtilizedRca.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/CpuUnderUtilizedRca.java new file mode 100644 index 000000000..f82a4dfef --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/CpuUnderUtilizedRca.java @@ -0,0 +1,126 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization; + +import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.util.RcaConsts.CPU_UU_RCA; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.collectors.StatsCollector; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metricsdb.MetricsDB; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.CpuUnderUtilizedRcaConfig; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Rca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources.State; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.RcaConf; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.Record; +import org.jooq.Result; + +public class CpuUnderUtilizedRca extends Rca> { + + private static final Logger LOG = LogManager.getLogger(CpuUnderUtilizedRca.class); + private static final long EVAL_INTERVAL_IN_S = 5; + private static final long DEFAULT_RCA_PERIOD = 60; + private static final double DEFAULT_UPPER_BOUND = 25D; + private final Metric cpuUtilization; + private final long rcaSamplesBeforeEval; + private final int numCores; + private long samples; + private double cpuUtilizationUpperBound; + + private final SlidingWindow cpuUtilizationSlidingWindow; + + public CpuUnderUtilizedRca(final Metric cpuUtilization) { + this(cpuUtilization, DEFAULT_RCA_PERIOD); + } + + public CpuUnderUtilizedRca(final Metric cpuUtilization, final long rcaEvaluationIntervalInS) { + super(EVAL_INTERVAL_IN_S); + this.numCores = Runtime.getRuntime().availableProcessors(); + this.cpuUtilization = cpuUtilization; + rcaSamplesBeforeEval = rcaEvaluationIntervalInS / EVAL_INTERVAL_IN_S; + this.cpuUtilizationSlidingWindow = new SlidingWindow<>((int) rcaEvaluationIntervalInS, + TimeUnit.MINUTES); + this.samples = 0; + this.cpuUtilizationUpperBound = DEFAULT_UPPER_BOUND; + } + + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + throw new UnsupportedOperationException("generateFlowUnitListFromWire should not be called " + + "for node-local rca: " + args.getNode().name()); + } + + @Override + public ResourceFlowUnit operate() { + samples++; + addToSlidingWindow(); + if (samples == rcaSamplesBeforeEval) { + samples = 0; + return evaluateAndEmit(); + } + + return new ResourceFlowUnit<>(System.currentTimeMillis()); + } + + private void addToSlidingWindow() { + long currTime = System.currentTimeMillis(); + double totalCpuUtilization = (getTotalCpuUtilization() / numCores) * 100D; + + cpuUtilizationSlidingWindow.next(new SlidingWindowData(currTime, totalCpuUtilization)); + } + + private double getTotalCpuUtilization() { + List metricFlowUnits = cpuUtilization.getFlowUnits(); + double totalUtilization = 0; + for (final MetricFlowUnit flowUnit : metricFlowUnits) { + if (flowUnit.isEmpty()) { + continue; + } + + Result records = flowUnit.getData(); + for (final Record record : records) { + try { + Double usage = record.getValue(MetricsDB.MAX, Double.class); + if (!Double.isNaN(usage)) { + totalUtilization += usage; + } + } catch (Exception e) { + StatsCollector.instance().logMetric(CPU_UU_RCA); + LOG.error("Filed to parse metric in FlowUnit: {} from {}", record, cpuUtilization.name()); + } + } + } + + return totalUtilization; + } + + private ResourceFlowUnit evaluateAndEmit() { + long currTime = System.currentTimeMillis(); + double averageCpuUtilization = cpuUtilizationSlidingWindow.readAvg(); + HotResourceSummary summary = new HotResourceSummary(ResourceUtil.CPU_USAGE, + cpuUtilizationUpperBound, averageCpuUtilization, (int) rcaSamplesBeforeEval); + + ResourceContext context; + if (averageCpuUtilization < cpuUtilizationUpperBound) { + context = new ResourceContext(State.UNDERUTILIZED); + } else { + context = new ResourceContext(State.HEALTHY); + } + return new ResourceFlowUnit<>(currTime, context, summary); + } + + @Override + public void readRcaConf(RcaConf rcaConf) { + final CpuUnderUtilizedRcaConfig config = rcaConf.getCpuUnderUtilizedRcaConfig(); + this.cpuUtilizationUpperBound = config.getCpuUtilizationThreshold(); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/DiskUnderUtilizedRca.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/DiskUnderUtilizedRca.java new file mode 100644 index 000000000..b42d3a264 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/DiskUnderUtilizedRca.java @@ -0,0 +1,150 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization; + +import static com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.util.RcaConsts.DISK_UU_RCA; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.collectors.StatsCollector; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.core.Util; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metrics.AllMetrics.DevicePartitionDimension; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.metricsdb.MetricsDB; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.configs.DiskUnderUtilizedRcaConfig; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Metric; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Rca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources.State; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.core.RcaConf; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; +import java.io.IOException; +import java.nio.file.FileStore; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.Record; +import org.jooq.Result; + +public class DiskUnderUtilizedRca extends Rca> { + + private static final Logger LOG = LogManager.getLogger(DiskUnderUtilizedRca.class); + private static final long EVAL_INTERVAL_IN_S = 5; + private final Metric totalSpaceMetric; + private final Metric shardSizeInBytes; + private double shardSpaceUtilizationThreshold; + + public DiskUnderUtilizedRca(final Metric totalSpaceMetric, final Metric shardSizeInBytes) { + this(totalSpaceMetric, shardSizeInBytes, + DiskUnderUtilizedRcaConfig.DEFAULT_SHARD_DISK_SPACE_UTILIZATION_PERCENT); + } + + public DiskUnderUtilizedRca(final Metric totalSpaceMetric, final Metric shardSizeInBytes, + final double shardSpaceUtilizationThreshold) { + super(EVAL_INTERVAL_IN_S); + this.totalSpaceMetric = totalSpaceMetric; + this.shardSizeInBytes = shardSizeInBytes; + this.shardSpaceUtilizationThreshold = shardSpaceUtilizationThreshold; + } + + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + throw new UnsupportedOperationException("generateFlowUnitListFromWire should not be called " + + "for node-local rca: " + args.getNode().name()); + } + + @Override + public ResourceFlowUnit operate() { + long currTime = System.currentTimeMillis(); + double totalPartitionSpace = getTotalPartitionSpace(); + double totalShardSize = getTotalShardSize(); + double diskSpaceUsed = (totalShardSize / totalPartitionSpace) * 100D; + + if (diskSpaceUsed < shardSpaceUtilizationThreshold) { + // cumulative shard size on disk is less than the threshold. So, classify this as + // underutilized disk. We also don't need to maintain a sliding window as shard size on + // disk is not a metric to vary(decrease - increase - decrease) by huge margins minute to + // minute to affect the under-utilization calculation. We can treat the current snapshot to + // be the representative value for the whole of the evaluation period of the node level rca. + + ResourceContext context = new ResourceContext(State.UNDERUTILIZED); + HotResourceSummary summary = new HotResourceSummary(ResourceUtil.CPU_USAGE, + shardSpaceUtilizationThreshold, diskSpaceUsed, (int) EVAL_INTERVAL_IN_S); + + return new ResourceFlowUnit<>(currTime, context, summary); + } + + return new ResourceFlowUnit<>(currTime, new ResourceContext(State.HEALTHY), + new HotResourceSummary(ResourceUtil.CPU_USAGE, shardSpaceUtilizationThreshold, + diskSpaceUsed, (int) EVAL_INTERVAL_IN_S)); + } + + private double getTotalShardSize() { + final List metricFlowUnits = shardSizeInBytes.getFlowUnits(); + double totalSpaceUsedByShards = 0; + for (MetricFlowUnit metricFlowUnit : metricFlowUnits) { + if (metricFlowUnit.isEmpty()) { + continue; + } + + Result records = metricFlowUnit.getData(); + for (final Record record : records) { + try { + Double usage = record.getValue(MetricsDB.MAX, Double.class); + if (!Double.isNaN(usage)) { + totalSpaceUsedByShards += usage; + } + } catch (Exception e) { + StatsCollector.instance().logMetric(DISK_UU_RCA); + LOG.error("Failed to parse metric in FlowUnit: {} from {}", record, + shardSizeInBytes.name(), e); + } + } + } + + return totalSpaceUsedByShards; + } + + private double getTotalPartitionSpace() { + final Path dataDirPath = Paths.get(Util.DATA_DIR); + if (Files.exists(dataDirPath)) { + // traverse up the file system to get to the mount point. + try { + FileStore fileStore = Files.getFileStore(dataDirPath); + String partition = fileStore.name(); + double partitionSizeInBytes = 0d; + final List metricFlowUnits = totalSpaceMetric.getFlowUnits(); + for (final MetricFlowUnit metricFlowUnit : metricFlowUnits) { + if (metricFlowUnit.isEmpty()) { + continue; + } + + partitionSizeInBytes = SQLParsingUtil.readDataFromSqlResult(metricFlowUnit.getData(), + DevicePartitionDimension.DEVICE_PARTITION.getField(), partition, MetricsDB.MAX); + + if (!Double.isNaN(partitionSizeInBytes) && partitionSizeInBytes > 0) { + break; + } + } + return partitionSizeInBytes; + } catch (IOException e) { + StatsCollector.instance().logMetric(DISK_UU_RCA); + LOG.error("Couldn't get the FileStore for the for data directory: {}", + dataDirPath.toString(), e); + return Double.NaN; + } + } else { + throw new IllegalStateException("Data directory path specified: " + Util.DATA_DIR + + " is not a valid file or a directory."); + } + } + + @Override + public void readRcaConf(RcaConf conf) { + final DiskUnderUtilizedRcaConfig config = conf.getDiskUnderUtilizedRcaConfig(); + this.shardSpaceUtilizationThreshold = config.getShardDiskSpaceUtilizationThreshold(); + } +} diff --git a/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/NodeUnderUtilizedRca.java b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/NodeUnderUtilizedRca.java new file mode 100644 index 000000000..9e80f87f9 --- /dev/null +++ b/src/main/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/rca/store/rca/jvmsizing/underutilization/NodeUnderUtilizedRca.java @@ -0,0 +1,84 @@ +package com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.store.rca.jvmsizing.underutilization; + +import com.amazon.opendistro.elasticsearch.performanceanalyzer.grpc.FlowUnitMessage; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Rca; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.Resources.State; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.framework.util.InstanceDetails; +import com.amazon.opendistro.elasticsearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +public class NodeUnderUtilizedRca extends Rca> { + + private static final Logger LOG = LogManager.getLogger(NodeUnderUtilizedRca.class); + private static final long EVAL_INTERVAL_IN_S = 5; + + private final Rca> cpuUnderUtilizedRca; + private final Rca> diskUnderUtilizedRca; + + public NodeUnderUtilizedRca(final Rca> cpuUnderUtilizedRca, + final Rca> diskUnderUtilizedRca) { + super(EVAL_INTERVAL_IN_S); + this.cpuUnderUtilizedRca = cpuUnderUtilizedRca; + this.diskUnderUtilizedRca = diskUnderUtilizedRca; + } + + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + List flowUnitMessages = args.getWireHopper().readFromWire(args.getNode()); + + setFlowUnits(flowUnitMessages.stream() + .map((Function>) + ResourceFlowUnit::buildFlowUnitFromWrapper) + .collect(Collectors.toList())); + } + + @Override + public ResourceFlowUnit operate() { + long currTime = System.currentTimeMillis(); + List> cpuUnderUtilizedFlowUnits = cpuUnderUtilizedRca + .getFlowUnits(); + List> diskUnderUtilizedFlowUnits = diskUnderUtilizedRca + .getFlowUnits(); + + // only one flow unit is expected for both the RCAs as they all have the same evaluation + // frequency. + + if (cpuUnderUtilizedFlowUnits.size() != 1 || diskUnderUtilizedFlowUnits.size() != 1) { + LOG.warn("Was expecting both CPUUnderUtilizedRca and DiskUnderUtilizedRca to have exactly " + + "one flowunit. Found: " + cpuUnderUtilizedFlowUnits.size() + ", and " + + diskUnderUtilizedFlowUnits.size() + " respectively"); + return new ResourceFlowUnit<>(currTime); + } + + final ResourceFlowUnit cpuUnderUtilizedFlowUnit = + cpuUnderUtilizedFlowUnits.get(0); + final ResourceFlowUnit diskUnderUtilizedFlowUnit = + diskUnderUtilizedFlowUnits.get(0); + + if (!cpuUnderUtilizedFlowUnit.isEmpty() && !diskUnderUtilizedFlowUnit.isEmpty()) { + boolean isCpuUnderUtilized = cpuUnderUtilizedFlowUnit.getResourceContext().isUnderUtilized(); + boolean isDiskUnderUtilized = diskUnderUtilizedFlowUnit.getResourceContext().isUnderUtilized(); + + if (isCpuUnderUtilized && isDiskUnderUtilized) { + InstanceDetails instanceDetails = getAppContext().getMyInstanceDetails(); + HotNodeSummary summary = new HotNodeSummary(instanceDetails.getInstanceId(), + instanceDetails.getInstanceIp()); + summary.appendNestedSummary(cpuUnderUtilizedFlowUnit.getSummary()); + summary.appendNestedSummary(diskUnderUtilizedFlowUnit.getSummary()); + + ResourceContext context = new ResourceContext(State.UNDERUTILIZED); + return new ResourceFlowUnit<>(currTime, context, summary); + } + } + + return new ResourceFlowUnit<>(currTime); + } +} diff --git a/src/main/proto/inter_node_rpc_service.proto b/src/main/proto/inter_node_rpc_service.proto index 936955b62..125bfb854 100644 --- a/src/main/proto/inter_node_rpc_service.proto +++ b/src/main/proto/inter_node_rpc_service.proto @@ -77,6 +77,9 @@ enum ResourceEnum { // Heap HEAP = 20 [(additional_fields).name = "heap"]; + + // Hardware Contd. + DISK = 31 [(additional_fields).name = "disk"]; } enum MetricEnum { @@ -106,6 +109,9 @@ enum MetricEnum { OLD_GEN_USAGE_AFTER_FULL_GC = 31 [(additional_fields).name = "full gc", (additional_fields).description = "old gen usage after full gc in mb"]; // GC FULL_GC = 32 [(additional_fields).name = "full gc", (additional_fields).description = "full gc pause time in ms"]; + + // Hardware Contd. + DISK_SPACE_USAGE = 48 [(additional_fields).name = "disk usage", (additional_fields).description = "disk space used"]; } /* diff --git a/src/test/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicyTest.java b/src/test/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicyTest.java index a0fcaaf47..311043cc8 100644 --- a/src/test/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicyTest.java +++ b/src/test/java/com/amazon/opendistro/elasticsearch/performanceanalyzer/decisionmaker/deciders/jvm/sizing/HeapSizeIncreasePolicyTest.java @@ -85,7 +85,7 @@ public void setUp() throws Exception { setupDataNodes(); setupMockAppContext(); setupMockRcaConf(); - testPolicy = new HeapSizeIncreasePolicy(mockLargeHeapClusterRca); + testPolicy = new HeapSizeIncreasePolicy(mockLargeHeapClusterRca, null); testPolicy.setAppContext(mockAppContext); testPolicy.setRcaConf(mockRcaConf); } @@ -148,8 +148,8 @@ private void setupMockAppContext() { private void setupMockRcaConf() { when(mockRcaConf.getJvmScaleUpPolicyConfig()).thenReturn(config); when(config.getUnhealthyNodePercentage()).thenReturn(UNHEALTHY_NODE_PERCENTAGE); - when(config.getDayBreachThreshold()).thenReturn(DAY_BREACH); - when(config.getWeekBreachThreshold()).thenReturn(WEEK_BREACH); + when(config.getDayBreachThresholdForContention()).thenReturn(DAY_BREACH); + when(config.getWeekBreachThresholdForContention()).thenReturn(WEEK_BREACH); } private void evalAt(Instant currentInstant) {