From 46bd63e4079d03254767cec6a0710eaf486a4f6b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:29:11 -0700 Subject: [PATCH 01/73] Remove log files and add DCO (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/MetricAttributes.java | 1 - .../model/MetricsModel.java | 5 + .../performanceanalyzer/rca/Version.java | 7 +- .../api/metrics/SearchBackPressureStats.java | 15 ++ .../rca/framework/metrics/ReaderMetrics.java | 8 +- .../rca/store/OpenSearchAnalysisGraph.java | 3 +- .../reader/MetricsEmitter.java | 114 ++++++++++ .../reader/ReaderMetricsProcessor.java | 23 ++ .../SearchBackPressureMetricsProcessor.java | 197 ++++++++++++++++++ .../SearchBackPressureMetricsSnapShot.java | 179 ++++++++++++++++ ...earchBackPressureMetricsProcessorTest.java | 161 ++++++++++++++ ...SearchBackPressureMetricsSnapShotTest.java | 136 ++++++++++++ 12 files changed, 844 insertions(+), 5 deletions(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java create mode 100644 src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java create mode 100644 src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java index 36230b8e5..414e266b7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java @@ -14,7 +14,6 @@ public class MetricAttributes { public HashSet dimensionNames; 
MetricAttributes(String unit, MetricDimension[] dimensions) { - this.unit = unit; this.dimensionNames = new HashSet(); for (MetricDimension dimension : dimensions) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index 5b144ac12..f7c781ac1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,6 +464,11 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); + // Search Back Pressure Metrics + allMetricsInitializer.put( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index bfc85fcd3..ac53b4d72 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -22,8 +22,11 @@ public final class Version { * Note: The RCA version is agnostic of OpenSearch version. 
*/
     static final class Major {
-        // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2)
-        // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change
+        // Bumping this post the Commons
+        // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2)
+        // and Service
+        // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8)
+        // change
         static final int RCA_MAJ_VERSION = 1;
     }
 
diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java
new file mode 100644
index 000000000..ae5c59814
--- /dev/null
+++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java
@@ -0,0 +1,19 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.performanceanalyzer.rca.framework.api.metrics;
+
+
+import org.opensearch.performanceanalyzer.rca.framework.api.Metric;
+
+/**
+ * RCA framework metric named {@code searchbp_shard_stats_cancellationCount}. The name and the
+ * evaluation interval are handed to the {@link Metric} base class unchanged.
+ */
+public class SearchBackPressureStats extends Metric {
+    public SearchBackPressureStats(long evaluationIntervalSeconds) {
+        super("searchbp_shard_stats_cancellationCount", evaluationIntervalSeconds);
+    }
+}
diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java
index 4c9fc5a04..ec6ce0fd9 100644
--- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java
+++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/metrics/ReaderMetrics.java
@@ -86,7 +86,13 @@ public enum ReaderMetrics implements MeasurementSet {
             "FaultDetectionMetricsEmitterExecutionTime",
             "millis",
             StatsType.LATENCIES,
-            Statistics.SUM);
+            Statistics.SUM),
+    SEARCH_BACK_PRESSURE_METRICS_EMITTER_EXECUTION_TIME(
+            "SearchBackPressureMetricsEmitterExecutionTime",
+            "millis",
+            StatsType.LATENCIES,
+            Statistics.SUM),
+    ;
 
     /** What we want to appear as the metric name. */
     private String name;
diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java
index e144f2ee1..80763befb 100644
--- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java
+++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java
@@ -183,7 +183,8 @@ public void construct() {
         // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds.
         // This is resulting in this RCA not getting executed in every 5 seconds.
         Rca> threadMetricsRca =
-                new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS);
+                new ThreadMetricsRca(
+                        threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS);
         threadMetricsRca.addTag(
                 RcaConsts.RcaTagConstants.TAG_LOCUS,
                 RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE);
diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java
index d209bf7f1..b060fb346 100644
--- a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java
+++ b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java
@@ -749,6 +749,86 @@ public static void emitGarbageCollectionInfo(
                 ReaderMetrics.GC_INFO_EMITTER_EXECUTION_TIME, mFinalT - mCurrT);
     }
 
+    /**
+     * Copies the search back pressure stats of the given snapshot into the metrics DB table
+     * {@code searchbp_stats}, writing one row per stat of every snapshot record.
+     *
+     * @param metricsDB metrics DB the stats are written to
+     * @param searchBackPressureMetricsSnapShot snapshot whose records are emitted
+     */
+    public static void emitSearchBackPressureMetrics(
+            MetricsDB metricsDB,
+            SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) {
+        long mCurrT = System.currentTimeMillis();
+        Result<Record> searchbpRecords = searchBackPressureMetricsSnapShot.fetchAll();
+
+        final String searchbpTypeDim = "SearchBackPressureStats";
+        final String searchbpTableName = "searchbp_stats";
+
+        List<String> dims = new ArrayList<>();
+        dims.add(searchbpTypeDim);
+
+        AllMetrics.SearchBackPressureStatsValue[] statsTypes = {
+            // Shard/Task Stats Cancellation Count
+            AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT,
+            AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TASK_STATS_CANCELLATIONCOUNT,
+            // Shard Stats Resource Heap / CPU Usage
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG,
+            // Task Stats Resource Heap / CPU Usage
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX,
+            AllMetrics.SearchBackPressureStatsValue
+                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG,
+        };
+
+        metricsDB.createMetric(new Metric<>(searchbpTableName, 0d), dims);
+        BatchBindStep handle = metricsDB.startBatchPut(new Metric<>(searchbpTableName, 0d), dims);
+
+        for (Record record : searchbpRecords) {
+            for (AllMetrics.SearchBackPressureStatsValue statsType : statsTypes) {
+                String statsName = statsType.toString();
+                Object rawValue = record.get(statsName);
+                // a stat that is null in the snapshot row is emitted as 0
+                long value = rawValue == null ? 0L : Long.parseLong(rawValue.toString());
+                // the sum, avg, min and max fields all carry the same single value
+                handle.bind(statsName, value, value, value, value);
+            }
+        }
+
+        // Nothing was bound when the snapshot has no rows; skip the write in that case.
+        // NOTE(review): jOOQ is expected to run the template insert once for an empty batch.
+        if (handle.size() > 0) {
+            handle.execute();
+        }
+
+        long mFinalT = System.currentTimeMillis();
+        LOG.debug(
+                "Total time taken for writing Search Back Pressure info into metricsDB: {}",
+                mFinalT - mCurrT);
+        ServiceMetrics.READER_METRICS_AGGREGATOR.updateStat(
+                ReaderMetrics.SEARCH_BACK_PRESSURE_METRICS_EMITTER_EXECUTION_TIME,
+                mFinalT - mCurrT);
+    }
+
     public static void emitAdmissionControlMetrics(
             MetricsDB metricsDB, AdmissionControlSnapshot snapshot) {
         long mCurrT = System.currentTimeMillis();
diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java
index 512c52f6d..3b446d95e 100644
--- a/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java
+++ b/src/main/java/org/opensearch/performanceanalyzer/reader/ReaderMetricsProcessor.java
@@ -70,6 +70,7 @@ public class ReaderMetricsProcessor implements Runnable {
             clusterManagerThrottlingMetricsMap;
     private NavigableMap shardStateMetricsMap;
     private
NavigableMap admissionControlMetricsMap; + private NavigableMap searchBackPressureMetricsMap; private static final int MAX_DATABASES = 2; private static final int OS_SNAPSHOTS = 4; @@ -81,6 +82,7 @@ public class ReaderMetricsProcessor implements Runnable { private static final int GC_INFO_SNAPSHOTS = 4; private static final int CLUSTER_MANAGER_THROTTLING_SNAPSHOTS = 2; private static final int AC_SNAPSHOTS = 2; + private static final int SEARCH_BP_SNAPSHOTS = 4; private final String rootLocation; private final AppContext appContext; @@ -125,6 +127,8 @@ public ReaderMetricsProcessor( gcInfoMap = new TreeMap<>(); clusterManagerThrottlingMetricsMap = new TreeMap<>(); admissionControlMetricsMap = new TreeMap<>(); + searchBackPressureMetricsMap = new TreeMap<>(); + this.rootLocation = rootLocation; this.configOverridesApplier = new ConfigOverridesApplier(); @@ -268,6 +272,7 @@ public void trimOldSnapshots() throws Exception { trimMap(gcInfoMap, GC_INFO_SNAPSHOTS); trimMap(clusterManagerThrottlingMetricsMap, CLUSTER_MANAGER_THROTTLING_SNAPSHOTS); trimMap(admissionControlMetricsMap, AC_SNAPSHOTS); + trimMap(searchBackPressureMetricsMap, SEARCH_BP_SNAPSHOTS); for (NavigableMap snap : nodeMetricsMap.values()) { // do the same thing as OS_SNAPSHOTS. 
Eventually MemoryDBSnapshot @@ -397,6 +402,7 @@ private void emitMetrics(long currWindowStartTime) throws Exception { emitAdmissionControlMetrics(prevWindowStartTime, metricsDB); emitClusterManagerMetrics(prevWindowStartTime, metricsDB); emitClusterManagerThrottlingMetrics(prevWindowStartTime, metricsDB); + emitSearchBackPressureMetrics(prevWindowStartTime, metricsDB); metricsDB.commit(); metricsDBMap.put(prevWindowStartTime, metricsDB); @@ -594,6 +600,19 @@ private void emitClusterManagerThrottlingMetrics( } } + private void emitSearchBackPressureMetrics(long prevWindowStartTime, MetricsDB metricsDB) + throws Exception { + if (searchBackPressureMetricsMap.containsKey(prevWindowStartTime)) { + SearchBackPressureMetricsSnapShot prevSearchBPSnapShot = + searchBackPressureMetricsMap.get(prevWindowStartTime); + MetricsEmitter.emitSearchBackPressureMetrics(metricsDB, prevSearchBPSnapShot); + } else { + LOG.debug( + "Search Back Pressure snapshot does not exist for the previous window. " + + "Not emitting metrics."); + } + } + /** * OS, Request, Http and cluster_manager first aligns the currentTimeStamp with a 5 second * interval. In the current format, a file (previously a directory) is written every 5 seconds. @@ -679,6 +698,9 @@ is ready so it starts to read that file (go back two windows and EventProcessor admissionControlProcessor = AdmissionControlProcessor.build( currWindowStartTime, conn, admissionControlMetricsMap); + EventProcessor searchBackPressureMetricsProcessor = + SearchBackPressureMetricsProcessor.buildSearchBackPressureMetricsProcessor( + currWindowStartTime, conn, searchBackPressureMetricsMap); // The event dispatcher dispatches events to each of the registered event processors. 
// In addition to event processing each processor has an initialize/finalize function that
@@ -702,6 +724,7 @@ is ready so it starts to read that file (go back two windows and
         eventDispatcher.registerEventProcessor(faultDetectionProcessor);
         eventDispatcher.registerEventProcessor(garbageCollectorInfoProcessor);
         eventDispatcher.registerEventProcessor(admissionControlProcessor);
+        eventDispatcher.registerEventProcessor(searchBackPressureMetricsProcessor);
 
         eventDispatcher.initializeProcessing(
                 currWindowStartTime, currWindowStartTime + MetricsConfiguration.SAMPLING_INTERVAL);
diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java
new file mode 100644
index 000000000..8c6e93d8c
--- /dev/null
+++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.performanceanalyzer.reader;
+
+
+import java.sql.Connection;
+import java.util.Map;
+import java.util.NavigableMap;
+import org.jooq.BatchBindStep;
+import org.opensearch.performanceanalyzer.commons.event_process.Event;
+import org.opensearch.performanceanalyzer.commons.event_process.EventProcessor;
+import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics;
+import org.opensearch.performanceanalyzer.commons.metrics.PerformanceAnalyzerMetrics;
+import org.opensearch.performanceanalyzer.commons.util.JsonConverter;
+
+/**
+ * Event processor that stores search back pressure stats events in the
+ * {@link SearchBackPressureMetricsSnapShot} of the window being read.
+ */
+public class SearchBackPressureMetricsProcessor implements EventProcessor {
+
+    // snapshot (backing table) that receives the rows parsed from the events
+    private final SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot;
+
+    // current batch insert; (re)created by initializeProcessing and commitBatchIfRequired
+    private BatchBindStep handle;
+
+    // processing window passed to initializeProcessing
+    private long startTime;
+    private long endTime;
+
+    private SearchBackPressureMetricsProcessor(
+            SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) {
+        this.searchBackPressureMetricsSnapShot = searchBackPressureMetricsSnapShot;
+    }
+
+    /**
+     * Builds a processor for the given window. The snapshot registered in the map for
+     * {@code currentWindowStartTime} is reused; if there is none, a new snapshot is created on
+     * {@code connection} and registered in the map.
+     *
+     * @param currentWindowStartTime key of the window in the snapshot map
+     * @param connection connection used when a new snapshot has to be created
+     * @param searchBackPressureSnapshotNavigableMap snapshots keyed by window start time
+     * @return a processor writing into the snapshot of that window
+     */
+    static SearchBackPressureMetricsProcessor buildSearchBackPressureMetricsProcessor(
+            long currentWindowStartTime,
+            Connection connection,
+            NavigableMap<Long, SearchBackPressureMetricsSnapShot>
+                    searchBackPressureSnapshotNavigableMap) {
+        SearchBackPressureMetricsSnapShot snapShot =
+                searchBackPressureSnapshotNavigableMap.get(currentWindowStartTime);
+        if (snapShot == null) {
+            snapShot = new SearchBackPressureMetricsSnapShot(connection, currentWindowStartTime);
+            searchBackPressureSnapshotNavigableMap.put(currentWindowStartTime, snapShot);
+        }
+        return new SearchBackPressureMetricsProcessor(snapShot);
+    }
+
+    @Override
+    public void initializeProcessing(long startTime, long endTime) {
+        this.startTime = startTime;
+        this.endTime = endTime;
+        this.handle = searchBackPressureMetricsSnapShot.startBatchPut();
+    }
+
+    @Override
+    public void finalizeProcessing() {
+        // only flush when rows were bound
+        if (handle.size() > 0) {
+            handle.execute();
+        }
+    }
+
+    @Override
+    public boolean shouldProcessEvent(Event event) {
+        return event.key.contains(PerformanceAnalyzerMetrics.sSearchBackPressureMetricsPath);
+    }
+
+    @Override
+    public void commitBatchIfRequired() {
+        if (handle.size() >= BATCH_LIMIT) {
+            handle.execute();
+            handle = searchBackPressureMetricsSnapShot.startBatchPut();
+        }
+    }
+
+    @Override
+    public void processEvent(Event event) {
+        handleSearchBackPressureEvent(event.value);
+
+        // flush the batch once it reaches BATCH_LIMIT rows
+        commitBatchIfRequired();
+    }
+
+    /**
+     * Parses one event value. Line 0 is the current-time string
+     * (e.g. {current_time:1686952296889}); line 1 is the JSON metrics payload.
+     *
+     * @throws RuntimeException if the payload line is missing
+     */
+    private void handleSearchBackPressureEvent(String eventValue) {
+        String[] lines = eventValue.split(System.lineSeparator());
+        if (lines.length < 2) {
+            throw new RuntimeException("Missing SearchBackPressure Metrics payload and timestamp.");
+        }
+
+        parseJsonLine(lines[1]);
+    }
+
+    /**
+     * Binds one row built from the JSON payload: one value per snapshot column, in column
+     * order. Keys absent from the payload are bound as null.
+     *
+     * @throws RuntimeException if the payload parses to an empty map
+     */
+    private void parseJsonLine(final String jsonString) {
+        Map<String, Object> map = JsonConverter.createMapFrom(jsonString);
+
+        if (map.isEmpty()) {
+            throw new RuntimeException("Missing SearchBackPressure Metrics payload.");
+        }
+
+        Object[] bindVals = new Object[SearchBackPressureMetricsSnapShot.STATS_VALUES.size()];
+        int idx = 0;
+        for (AllMetrics.SearchBackPressureStatsValue statsValue :
+                SearchBackPressureMetricsSnapShot.STATS_VALUES) {
+            bindVals[idx++] = map.get(statsValue.toString());
+        }
+
+        handle.bind(bindVals);
+    }
+}
diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java
new file mode 100644
index 000000000..b995cbe44
--- /dev/null
+++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShot.java
@@ -0,0 +1,120 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.performanceanalyzer.reader;
+
+
+import java.sql.Connection;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import org.jooq.BatchBindStep;
+import org.jooq.DSLContext;
+import org.jooq.Field;
+import org.jooq.Record;
+import org.jooq.Result;
+import org.jooq.SQLDialect;
+import org.jooq.impl.DSL;
+import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics;
+
+/**
+ * SQLite table holding the search back pressure stats read for one window. The table is named
+ * {@code search_back_pressure_<windowStartTime>} and has one {@code Long} column per entry of
+ * {@link #STATS_VALUES}.
+ */
+public class SearchBackPressureMetricsSnapShot implements Removable {
+
+    /**
+     * Stats stored by the snapshot, in column order. The processor binds row values in this
+     * same order, so both classes must share this list.
+     */
+    static final List<AllMetrics.SearchBackPressureStatsValue> STATS_VALUES =
+            Collections.unmodifiableList(
+                    Arrays.asList(
+                            // Shard/Task Stats Cancellation Count
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT,
+                            // Shard Stats Resource Heap / CPU Usage
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG,
+                            // Task Stats Resource Heap / CPU Usage
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX,
+                            AllMetrics.SearchBackPressureStatsValue
+                                    .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG));
+
+    // entry point to interact with the SQLite db
+    private final DSLContext create;
+
+    private final String tableName;
+    private final List<Field<?>> columns;
+
+    /**
+     * Creates the backing table for the given window.
+     *
+     * @param conn connection to the SQLite db the table is created in
+     * @param windowStartTime start time of the window; used as the table name suffix
+     */
+    public SearchBackPressureMetricsSnapShot(Connection conn, Long windowStartTime) {
+        this.create = DSL.using(conn, SQLDialect.SQLITE);
+        this.tableName = "search_back_pressure_" + windowStartTime;
+
+        this.columns = new ArrayList<>(STATS_VALUES.size());
+        for (AllMetrics.SearchBackPressureStatsValue statsValue : STATS_VALUES) {
+            this.columns.add(DSL.field(DSL.name(statsValue.toString()), Long.class));
+        }
+
+        // create table with columns specified
+        create.createTable(tableName).columns(columns).execute();
+    }
+
+    public DSLContext getDSLContext() {
+        return create;
+    }
+
+    /**
+     * Starts a batch insert into the snapshot table. The statement is built with one null
+     * placeholder per column; callers bind one value per column for every row.
+     */
+    public BatchBindStep startBatchPut() {
+        List<Object> dummyValues = new ArrayList<>(columns.size());
+        for (int i = 0; i < columns.size(); i++) {
+            dummyValues.add(null);
+        }
+        return create.batch(create.insertInto(DSL.table(this.tableName)).values(dummyValues));
+    }
+
+    /** Returns every row currently stored in the snapshot table. */
+    public Result<Record> fetchAll() {
+        return create.select().from(DSL.table(tableName)).fetch();
+    }
+
+    /** Drops the snapshot table. */
+    @Override
+    public void remove() throws Exception {
+        create.dropTable(DSL.table(tableName)).execute();
+    }
+}
diff --git a/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java
new file mode 100644
index 000000000..6abbf4a90
--- /dev/null
+++ b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessorTest.java
@@ -0,0 +1,161 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.performanceanalyzer.reader;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.util.NavigableMap;
+import java.util.TreeMap;
+import org.jooq.Record;
+import org.jooq.Result;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+import org.opensearch.performanceanalyzer.commons.event_process.Event;
+import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics;
+import org.opensearch.performanceanalyzer.commons.metrics.PerformanceAnalyzerMetrics;
+
+public class SearchBackPressureMetricsProcessorTest {
+    private static final String DB_URL = "jdbc:sqlite:";
+    // private static final String TEST_MEM_POOL = "testMemPool";
+    // private static final String COLLECTOR_NAME = "testCollectorName";
+    private static final String SEARCH_BACK_PRESSURE_STATS_KEY = "search_back_pressure_stats";
+    private SearchBackPressureMetricsProcessor searchBackPressureMetricsProcessor;
+    private long currTimeStamp;
+
+    private NavigableMap<Long, SearchBackPressureMetricsSnapShot> searchBackPressureStatsMap;
+    Connection conn;
+
+    // mock
SearchBackPressureStatsCollector to test Event processing
+    private static final String SERIALIZED_EVENT =
+            "{\"searchbp_shard_stats_cancellationCount\":2,"
+                    + "\"searchbp_shard_stats_limitReachedCount\":2,"
+                    + "\"searchbp_shard_stats_resource_heap_usage_cancellationCount\":3,"
+                    + "\"searchbp_shard_stats_resource_heap_usage_currentMax\":3,"
+                    + "\"searchbp_shard_stats_resource_heap_usage_rollingAvg\":3,"
+                    + "\"searchbp_shard_stats_resource_cpu_usage_cancellationCount\":5,"
+                    + "\"searchbp_shard_stats_resource_cpu_usage_currentMax\":5,"
+                    + "\"searchbp_shard_stats_resource_cpu_usage_currentAvg\":5,"
+                    + "\"searchbp_shard_stats_resource_elaspedtime_usage_cancellationCount\":2,"
+                    + "\"searchbp_shard_stats_resource_elaspedtime_usage_currentMax\":2,"
+                    + "\"searchbp_shard_stats_resource_elaspedtime_usage_currentAvg\":2,"
+                    + "\"searchbp_task_stats_cancellationCount\":0,"
+                    + "\"searchbp_task_stats_limitReachedCount\":0,"
+                    + "\"searchbp_task_stats_resource_heap_usage_cancellationCount\":0,"
+                    + "\"searchbp_task_stats_resource_heap_usage_currentMax\":0,"
+                    + "\"searchbp_task_stats_resource_heap_usage_rollingAvg\":0,"
+                    + "\"searchbp_task_stats_resource_cpu_usage_cancellationCount\":0,"
+                    + "\"searchbp_task_stats_resource_cpu_usage_currentMax\":0,"
+                    + "\"searchbp_task_stats_resource_cpu_usage_currentAvg\":0,"
+                    + "\"searchbp_task_stats_resource_elaspedtime_usage_cancellationCount\":0,"
+                    + "\"searchbp_task_stats_resource_elaspedtime_usage_currentMax\":0,"
+                    + "\"searchbp_task_stats_resource_elaspedtime_usage_currentAvg\":0,"
+                    + "\"searchbp_mode\":\"MONITOR_ONLY\","
+                    + "\"searchbp_nodeid\":\"FgNAAAQQQDSROABCDEFHTX\"}";
+
+    @Before
+    public void setup() throws Exception {
+        Class.forName("org.sqlite.JDBC");
+        System.setProperty("java.io.tmpdir", "/tmp");
+        conn = DriverManager.getConnection(DB_URL);
+        this.currTimeStamp = System.currentTimeMillis();
+        this.searchBackPressureStatsMap = new TreeMap<>();
+        this.searchBackPressureMetricsProcessor =
+                SearchBackPressureMetricsProcessor.buildSearchBackPressureMetricsProcessor(
+                        currTimeStamp, conn, searchBackPressureStatsMap);
+    }
+
+    // Valid case: one event with a full payload ends up as one row in the snapshot
+    @Test
+    public void testSearchBackPressureProcessEvent() throws Exception {
+        // Create a SearchBackPressureEvent
+        Event testEvent = buildTestSearchBackPressureStatsEvent();
+
+        // Run the event through the processor for the current window
+        searchBackPressureMetricsProcessor.initializeProcessing(
+                this.currTimeStamp, System.currentTimeMillis());
+        assertTrue(searchBackPressureMetricsProcessor.shouldProcessEvent(testEvent));
+
+        searchBackPressureMetricsProcessor.processEvent(testEvent);
+        searchBackPressureMetricsProcessor.finalizeProcessing();
+
+        SearchBackPressureMetricsSnapShot currSnapshot =
+                searchBackPressureStatsMap.get(this.currTimeStamp);
+        Result<Record> result = currSnapshot.fetchAll();
+        assertEquals(1, result.size());
+
+        // SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG value is 3L according to the
+        // SERIALIZED_EVENT, should EQUAL
+        Assert.assertEquals(
+                3L,
+                result.get(0)
+                        .get(
+                                AllMetrics.SearchBackPressureStatsValue
+                                        .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG
+                                        .toString()));
+        // SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT value is 0L according to the
+        // SERIALIZED_EVENT, should EQUAL
+        Assert.assertEquals(
+                0L,
+                result.get(0)
+                        .get(
+                                AllMetrics.SearchBackPressureStatsValue
+                                        .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT
+                                        .toString()));
+
+        // SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT value is 0L according to the
+        // SERIALIZED_EVENT, so it should NOT EQUAL 2L
+        Assert.assertNotEquals(
+                2L,
+                result.get(0)
+                        .get(
+                                AllMetrics.SearchBackPressureStatsValue
+                                        .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT
+                                        .toString()));
+    }
+
+    @Test
+    public void testEmptySearchBackPressureProcessEvent() throws Exception {
+        // Create a SearchBackPressureEvent that only carries the timestamp line
+        Event testEvent = buildEmptyTestSearchBackPressureStatsEvent();
+
+        // Run the event through the processor for the current window
+        searchBackPressureMetricsProcessor.initializeProcessing(
+                this.currTimeStamp, System.currentTimeMillis());
+        assertTrue(searchBackPressureMetricsProcessor.shouldProcessEvent(testEvent));
+
+        try {
+            searchBackPressureMetricsProcessor.processEvent(testEvent);
+            Assert.fail(
+                    "Negative scenario test: processEvent should throw a RuntimeException "
+                            + "when the metrics payload line is missing");
+        } catch (RuntimeException ex) {
+            // expected: processEvent threw before Assert.fail was reached
+        }
+    }
+
+    private Event buildTestSearchBackPressureStatsEvent() {
+        StringBuilder str = new StringBuilder();
+        str.append(PerformanceAnalyzerMetrics.getJsonCurrentMilliSeconds())
+                .append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor);
+
+        str.append(SERIALIZED_EVENT).append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor);
+        return new Event(
+                SEARCH_BACK_PRESSURE_STATS_KEY, str.toString(), System.currentTimeMillis());
+    }
+
+    private Event buildEmptyTestSearchBackPressureStatsEvent() {
+        StringBuilder str = new StringBuilder();
+        str.append(PerformanceAnalyzerMetrics.getJsonCurrentMilliSeconds())
+                .append(PerformanceAnalyzerMetrics.sMetricNewLineDelimitor);
+
+        return new Event(
+                SEARCH_BACK_PRESSURE_STATS_KEY, str.toString(), System.currentTimeMillis());
+    }
+}
diff --git a/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java
new file mode 100644
index 000000000..eeaa1a30f
--- /dev/null
+++ b/src/test/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsSnapShotTest.java
@@ -0,0 +1,136 @@
+/*
+ * Copyright OpenSearch Contributors
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+package org.opensearch.performanceanalyzer.reader;
+
+import static org.junit.Assert.assertEquals;
+
+import java.sql.Connection;
+import
java.sql.DriverManager; +import java.util.ArrayList; +import org.jooq.BatchBindStep; +import org.jooq.Record; +import org.jooq.Result; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; + +public class SearchBackPressureMetricsSnapShotTest { + private static final String DB_URL = "jdbc:sqlite:"; + private Connection conn; + SearchBackPressureMetricsSnapShot snapshot; + + ArrayList required_searchbp_dims = + new ArrayList() { + { + // Shard/Task Stats Cancellation Count + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + + // Shard Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + + // Task Stats Resource Heap / CPU Usage + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + 
.SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_ROLLINGAVG + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CANCELLATIONCOUNT + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTMAX + .toString()); + this.add( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_CPU_USAGE_CURRENTAVG + .toString()); + } + }; + + @Before + public void setup() throws Exception { + Class.forName("org.sqlite.JDBC"); + System.setProperty("java.io.tmpdir", "/tmp"); + conn = DriverManager.getConnection(DB_URL); + snapshot = new SearchBackPressureMetricsSnapShot(conn, System.currentTimeMillis()); + } + + @Test + public void testReadSearchBackPressureMetricsSnapshot() throws Exception { + final BatchBindStep handle = snapshot.startBatchPut(); + insertIntoTable(handle); + + final Result result = snapshot.fetchAll(); + + assertEquals(1, result.size()); + // for 14 (length of required_searchbp_dims) fields, each assign a value from 0 to 13 + // test each field and verify the result + for (long i = 0; i < required_searchbp_dims.size(); i++) { + Assert.assertEquals( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString() + + " should be " + + String.valueOf(i), + i, + result.get(0).get(required_searchbp_dims.get((int) i))); + } + } + + @After + public void tearDown() throws Exception { + conn.close(); + } + + private void insertIntoTable(BatchBindStep handle) { + Object[] bindVals = new Object[required_searchbp_dims.size()]; + for (int i = 0; i < required_searchbp_dims.size(); i++) { + bindVals[i] = Long.valueOf(i); + } + + handle.bind(bindVals).execute(); + } +} From 92c3fc808a7aff3e914e44797847b8bbbb2b5264 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:38:55 -0700 Subject: [PATCH 02/73] Remove extra files (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- 
.../api/metrics/SearchBackPressureStats.java | 15 --------------- .../SearchBackPressureMetricsProcessor.java | 2 +- 2 files changed, 1 insertion(+), 16 deletions(-) delete mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java deleted file mode 100644 index ae5c59814..000000000 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/SearchBackPressureStats.java +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.performanceanalyzer.rca.framework.api.metrics; - - -import org.opensearch.performanceanalyzer.rca.framework.api.Metric; - -public class SearchBackPressureStats extends Metric { - public SearchBackPressureStats(long evaluationIntervalSeconds) { - super("searchbp_shard_stats_cancellationCount", evaluationIntervalSeconds); - } -} diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java index 8c6e93d8c..8eec8a831 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/SearchBackPressureMetricsProcessor.java @@ -38,7 +38,7 @@ private SearchBackPressureMetricsProcessor( SearchBackPressureMetricsSnapShot searchBackPressureMetricsSnapShot) { this.searchBackPressureMetricsSnapShot = searchBackPressureMetricsSnapShot; } - + /* * if current SnapShotMap has the snapshot for currentWindowStartTime, use the snapshot to build the processor * else create a new Instance of SearchBackPressureMetricsSnapShot to initialize 
the processor From a47388af973f39d93f049ef60e9d9b043bf9aa2b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 11:42:43 -0700 Subject: [PATCH 03/73] Remove styling difference (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/model/MetricAttributes.java | 1 + .../org/opensearch/performanceanalyzer/rca/Version.java | 7 ++----- .../rca/store/OpenSearchAnalysisGraph.java | 3 +-- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java index 414e266b7..36230b8e5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricAttributes.java @@ -14,6 +14,7 @@ public class MetricAttributes { public HashSet dimensionNames; MetricAttributes(String unit, MetricDimension[] dimensions) { + this.unit = unit; this.dimensionNames = new HashSet(); for (MetricDimension dimension : dimensions) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index ac53b4d72..bfc85fcd3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -22,11 +22,8 @@ public final class Version { * Note: The RCA version is agnostic of OpenSearch version. 
*/ static final class Major { - // Bumping this post the Commons - // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service - // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) - // change + // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change static final int RCA_MAJ_VERSION = 1; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 80763befb..e144f2ee1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -183,8 +183,7 @@ public void construct() { // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds. // This is resulting in this RCA not getting executed in every 5 seconds. 
Rca> threadMetricsRca = - new ThreadMetricsRca( - threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); + new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); threadMetricsRca.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); From 8b86501bec7ec07d0c87c82515883c61e7dacb72 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 22 Jun 2023 12:00:58 -0700 Subject: [PATCH 04/73] Remove unnecessary file changes (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../opensearch/performanceanalyzer/model/MetricsModel.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index f7c781ac1..5b144ac12 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,11 +464,6 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); - // Search Back Pressure Metrics - allMetricsInitializer.put( - AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT - .toString(), - new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } From c6549a9be4dd77e7c9bc1bc5b96883b17fc7aa48 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 16:06:04 -0700 Subject: [PATCH 05/73] Add RCA_Decider (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/MetricsModel.java | 5 + .../configs/SearchBackPressureRcaConfig.java | 32 ++++ .../framework/api/metrics/Searchbp_Stats.java | 18 ++ .../rca/framework/core/RcaConf.java | 5 + .../rca/store/OpenSearchAnalysisGraph.java | 12 +- 
.../SearchBackPressurClusterRCA.java | 22 +++ .../SearchBackPressureRCA.java | 162 ++++++++++++++++++ 7 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java index 5b144ac12..f7c781ac1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java +++ b/src/main/java/org/opensearch/performanceanalyzer/model/MetricsModel.java @@ -464,6 +464,11 @@ public class MetricsModel { MetricUnits.MILLISECOND.toString(), AllMetrics.ShardIndexingPressureDimension.values())); + // Search Back Pressure Metrics + allMetricsInitializer.put( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + new MetricAttributes(MetricUnits.COUNT.toString(), EmptyDimension.values())); ALL_METRICS = Collections.unmodifiableMap(allMetricsInitializer); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java new file mode 100644 index 000000000..e646c3f69 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -0,0 +1,32 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.configs; + + +import 
org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; + +public class SearchBackPressureRcaConfig { + public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; + + // INTERVAL PERIOD IN SECONDS + public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; + + // Increase Threshold + // node max heap usage in last 60 secs is less than 70% + public static final int DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD = 70; + + // cancellationCount due to heap is more than 50% of all task cancellations. + public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + + // Decrease Threshold + // node min heap usage in last 60 secs is more than 80% + public static final int DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD = 80; + + // cancellationCount due to heap is more than 30% of all task cancellations + public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + + public SearchBackPressureRcaConfig(final RcaConf conf) {} +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java new file mode 100644 index 000000000..afe553da0 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java @@ -0,0 +1,18 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.framework.api.metrics; + + +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; + +public class Searchbp_Stats extends Metric { + public static final String NAME = AllMetrics.HeapValue.HEAP_USED.name(); + + public Heap_Used(long evaluationIntervalSeconds) { + super(AllMetrics.HeapValue.HEAP_USED.toString(), evaluationIntervalSeconds); + } +} diff --git 
a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java index 4005e1c15..0ff06f0af 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/core/RcaConf.java @@ -51,6 +51,7 @@ import org.opensearch.performanceanalyzer.rca.configs.HotShardRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.OldGenContendedRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.QueueRejectionRcaConfig; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.configs.ShardRequestCacheRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.bucket.BasicBucketCalculator; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.bucket.BucketCalculator; @@ -232,6 +233,10 @@ public OldGenContendedRcaConfig getOldGenContendedRcaConfig() { return new OldGenContendedRcaConfig(this); } + public SearchBackPressureRcaConfig getSearchBackPressureRcaConfig() { + return new SearchBackPressureRcaConfig(this); + } + public T readRcaConfig( String rcaName, String key, T defaultValue, Class clazz) { return readRcaConfig(rcaName, key, defaultValue, (s) -> true, clazz); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index e144f2ee1..c24ac1f47 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -85,6 +85,7 @@ import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenContendedRca; 
import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenReclamationRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureRCA; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.NodeTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.CpuUtilDimensionTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.HeapAllocRateTemperatureRca; @@ -183,7 +184,8 @@ public void construct() { // Use EVALUATION_INTERVAL_SECONDS instead of RCA_PERIOD which resolved to 12 seconds. // This is resulting in this RCA not getting executed in every 5 seconds. Rca> threadMetricsRca = - new ThreadMetricsRca(threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); + new ThreadMetricsRca( + threadBlockedTime, threadWaitedTime, EVALUATION_INTERVAL_SECONDS); threadMetricsRca.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); @@ -432,6 +434,14 @@ public void construct() { shardRequestCacheClusterRca, highHeapUsageClusterRca)); + // Search Back Pressure Service RCA + final SearchBackPressureRCA searchBackPressureRCA = + new SearchBackPressureRCA(heapMax, heapUsed, gcType); + searchBackPressureRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, gcType)); + AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java new file mode 100644 index 000000000..b97d9c0cd --- /dev/null +++ 
b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java @@ -0,0 +1,22 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + + +import org.opensearch.performanceanalyzer.rca.framework.api.Rca; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.store.rca.cluster.BaseClusterRca; + +public class SearchBackPressurClusterRCA extends BaseClusterRca { + + public static final String RCA_TABLE_NAME = SearchBackPressurClusterRCA.class.getSimpleName(); + + public >> SearchBackPressurClusterRCA( + final int rcaPeriod, final R SearchBackPressureRCA) { + super(rcaPeriod, SearchBackPressureRCA); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java new file mode 100644 index 000000000..1937e0e0b --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -0,0 +1,162 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + +import java.util.ArrayList; +import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jooq.Field; +import org.opensearch.performanceanalyzer.grpc.FlowUnitMessage; +import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import 
org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; +import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; + +public class SearchBackPressureRCA extends OldGenRca> { + // LOGGER for SearchBackPressureRCA + private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); + private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); + private static final long EVAL_INTERVAL_IN_S = 5; + + // Key Metrics to be used to determine health status + // Task Level cancellationCount + // Shard Level cancellationCount + // Task level max heap usage + // Shard level max heap usage + // total node heap usage + private final Metric heapUsed; + // private final Metric SearchBPCancellationJVMPercentage; + + private long SearchBPCancellationJVMThreshold; + + // cases to incrase threshold + private long heapUsedIncreaseMaxThreshold; + private long heapCancellationIncreaseMaxThreshold; + + // case to decrease threshold + private long heapUsedDecreaseMinThreshold; + private long heapCancellationDecreaseMaxThreashold; + + // Period: 60s + + // track how many samples has been checked (only reach 60s (12 * 5s) to execute + // operate()) + private long counter; + + // key functions to be overriden + // operate(): determine whether to generate of flow unit of HEALTHY or UNHEALTHY + // readRcaConf(): read the key configuration metrics like heapMaxThreshold, + // heapMinThreshold, + // cancellationHeapPercentageThreshold + // counter to keep track of times of checking, as the default 
sliding window is + // 60 times, and + // interval for RCA scanning is 5s + // counter needs to be at least 12 to trigger operate(): 12 is the + // rcaSamplesBeforeEval + + // generateFlowUnitListFromWite() gets wireFlowUnits() (Do we need this?) + + // Not to be overriden but need to have + // read_cancellationcount_from_sql_shard + // read_cancellationcount_from_sql_task + // read_heapused_from_sql + // for heapused, simply call getOldGenUsedOrDefault() from OldGenRca.java + public SearchBackPressureRCA(final Metric heapMax, final Metric heapUsed, Metric gcType) { + super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); + this.heapUsed = heapUsed; + this.heapUsedIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; + this.heapCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapUsedDecreaseMinThreshold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD; + this.heapCancellationDecreaseMaxThreashold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + + LOG.info("SearchBackPressureRCA initialized"); + } + + /* + * operate() is used for local build + * generateFlowUnitListFromWire simply use remote flowunits to + */ + @Override + public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { + final List flowUnitMessages = + args.getWireHopper().readFromWire(args.getNode()); + final List> flowUnitList = new ArrayList<>(); + LOG.debug("rca: Executing fromWire: {}", this.getClass().getSimpleName()); + for (FlowUnitMessage flowUnitMessage : flowUnitMessages) { + flowUnitList.add(ResourceFlowUnit.buildFlowUnitFromWrapper(flowUnitMessage)); + } + setFlowUnits(flowUnitList); + } + + @Override + public ResourceFlowUnit operate() { + LOG.info("SearchBackPressureRCA operate() intiatilized"); + // Use OldGenRca.java to get heap usage and max heap size + double prevHeapUsage = getOldGenUsedOrDefault(0d); + double 
maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + + double heapUsedPercentage = prevHeapUsage / maxHeapSize; + + // function to read cancellation count from sql + + // print out oldGenUsed and maxOldGen + LOG.info( + "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}", + prevHeapUsage, + maxHeapSize, + heapUsedPercentage); + LOG.info("SearchBackPressureRCA operate() finished"); + return null; + } + + private long getSearchBackPressureShardCancellationCount() { + getMetric(null, null, null) + return 0; + } + + private long getSearchBackPressureTaskCancellationCount() { + return 0; + } + + private double getMetric(M metric, Field field, String fieldName) { + double response = 0; + for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { + if (!flowUnit.isEmpty()) { + double metricResponse = + readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); + if (!Double.isNaN(metricResponse) && metricResponse > 0) { + response = metricResponse; + } + } + } + return response; + } + + /** + * read threshold values from rca.conf + * + * @param conf RcaConf object + */ + @Override + public void readRcaConf(RcaConf conf) { + // only initialized one time + LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); + final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); + // read anything from config file in runtime + // if not just skip it + } +} From a296384f78a79513b85d944c8b8890d67cb6459e Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 22:14:06 -0700 Subject: [PATCH 06/73] Extract Heap Usage from SQlitedb (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../performanceanalyzer/rca/Version.java | 9 +++++--- .../framework/api/metrics/Searchbp_Stats.java | 8 +++---- .../rca/store/OpenSearchAnalysisGraph.java | 7 +++++++ .../SearchBackPressureRCA.java | 21 +++++++++++++++++-- .../reader/MetricsEmitter.java | 6 ++++-- 5 files changed, 40 
insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index bfc85fcd3..402013cf7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -19,11 +19,14 @@ public final class Version { * transferred packets should be dropped. Every increment here should be accompanied with a line * describing the version bump. * - * Note: The RCA version is agnostic of OpenSearch version. + *

Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change + // Bumping this post the Commons + // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service + // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) + // change static final int RCA_MAJ_VERSION = 1; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java index afe553da0..e655e4edc 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/metrics/Searchbp_Stats.java @@ -10,9 +10,9 @@ import org.opensearch.performanceanalyzer.rca.framework.api.Metric; public class Searchbp_Stats extends Metric { - public static final String NAME = AllMetrics.HeapValue.HEAP_USED.name(); - - public Heap_Used(long evaluationIntervalSeconds) { - super(AllMetrics.HeapValue.HEAP_USED.toString(), evaluationIntervalSeconds); + public Searchbp_Stats(long evaluationIntervalSeconds) { + super( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TABLE_NAME.toString(), + evaluationIntervalSeconds); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index c24ac1f47..1cb4966af 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -118,6 +118,9 @@ public void 
construct() { MetricsDB.AVG, AllMetrics.CommonDimension.OPERATION.toString()); + // SearchBackpressure Metric + // Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); + heapUsed.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); @@ -142,6 +145,9 @@ public void construct() { threadWaitedTime.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + // searchbp_Stats.addTag( + // RcaConsts.RcaTagConstants.TAG_LOCUS, + // RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); addLeaf(heapUsed); addLeaf(gcEvent); @@ -151,6 +157,7 @@ public void construct() { addLeaf(cpuUtilizationGroupByOperation); addLeaf(threadBlockedTime); addLeaf(threadWaitedTime); + // addLeaf(searchbp_Stats); // add node stats metrics List nodeStatsMetrics = constructNodeStatsMetrics(); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 1937e0e0b..9b3dec835 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -5,6 +5,8 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; +import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; + import java.util.ArrayList; import java.util.List; import org.apache.logging.log4j.LogManager; @@ -20,7 +22,6 @@ import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; -import static 
org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; public class SearchBackPressureRCA extends OldGenRca> { // LOGGER for SearchBackPressureRCA @@ -35,6 +36,7 @@ public class SearchBackPressureRCA extends OldGenRca operate() { } private long getSearchBackPressureShardCancellationCount() { - getMetric(null, null, null) + // Use Searchbp_Stats metrics to get the metrics value + // Field shard_cancellation_count_field = + // DSL.field( + // DSL.name( + // AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM + // .toString()), + // String.class); + // double searchbpShardCancellationCount = + // getMetric(this.searchbp_Stats, shard_cancellation_count_field, "avg"); + + // LOG searchbpShardCancellationCount + // LOG.info( + // "SearchBackPressureRCA: searchbpShardCancellationCount: {}", + // searchbpShardCancellationCount); + return 0; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java index b060fb346..fa7008748 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java +++ b/src/main/java/org/opensearch/performanceanalyzer/reader/MetricsEmitter.java @@ -756,8 +756,10 @@ public static void emitSearchBackPressureMetrics( Result searchbp_records = searchBackPressureMetricsSnapShot.fetchAll(); // String SEARCHBP_MODE_DIM = "searchbp_mode"; - String SEARCHBP_TYPE_DIM = "SearchBackPressureStats"; - String SEARCHBP_TABLE_NAME = "searchbp_stats"; + String SEARCHBP_TYPE_DIM = + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM.toString(); + String SEARCHBP_TABLE_NAME = + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TABLE_NAME.toString(); List dims = new ArrayList() { From c44b9287517d7001df37b7b52babe437ecd7b0d2 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 26 Jun 2023 23:14:37 -0700 Subject: [PATCH 07/73] Extract required searchbp metrics for deciders 
(signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/OpenSearchAnalysisGraph.java | 16 ++-- .../SearchBackPressureRCA.java | 77 +++++++++++++++---- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 1cb4966af..5d86ca57c 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -41,6 +41,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Heap_Max; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Heap_Used; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.IndexWriter_Memory; +import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Searchbp_Stats; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_QueueCapacity; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.ThreadPool_RejectedReqs; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.Thread_Blocked_Time; @@ -119,7 +120,7 @@ public void construct() { AllMetrics.CommonDimension.OPERATION.toString()); // SearchBackpressure Metric - // Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); + Metric searchbp_Stats = new Searchbp_Stats(EVALUATION_INTERVAL_SECONDS); heapUsed.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, @@ -145,9 +146,9 @@ public void construct() { threadWaitedTime.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - // searchbp_Stats.addTag( - // RcaConsts.RcaTagConstants.TAG_LOCUS, - // RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + searchbp_Stats.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + 
RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); addLeaf(heapUsed); addLeaf(gcEvent); @@ -157,7 +158,7 @@ public void construct() { addLeaf(cpuUtilizationGroupByOperation); addLeaf(threadBlockedTime); addLeaf(threadWaitedTime); - // addLeaf(searchbp_Stats); + addLeaf(searchbp_Stats); // add node stats metrics List nodeStatsMetrics = constructNodeStatsMetrics(); @@ -443,11 +444,12 @@ public void construct() { // Search Back Pressure Service RCA final SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(heapMax, heapUsed, gcType); + new SearchBackPressureRCA(heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, gcType)); + searchBackPressureRCA.addAllUpstreams( + Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 9b3dec835..f51570309 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -12,6 +12,8 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; +import org.jooq.impl.DSL; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.grpc.FlowUnitMessage; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; @@ -36,8 +38,7 @@ 
public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( + final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; - // this.searchbp_Stats = new Searchbp_Stats(5); + this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; this.heapCancellationIncreaseMaxThreshold = @@ -115,6 +117,7 @@ public ResourceFlowUnit operate() { double heapUsedPercentage = prevHeapUsage / maxHeapSize; // function to read cancellation count from sql + getSearchBackPressureShardCancellationCount(); // print out oldGenUsed and maxOldGen LOG.info( @@ -127,21 +130,61 @@ public ResourceFlowUnit operate() { } private long getSearchBackPressureShardCancellationCount() { + LOG.info("getSearchBackPressureShardCancellationCount() STARTED"); + // Use Searchbp_Stats metrics to get the metrics value - // Field shard_cancellation_count_field = - // DSL.field( - // DSL.name( - // AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM - // .toString()), - // String.class); - // double searchbpShardCancellationCount = - // getMetric(this.searchbp_Stats, shard_cancellation_count_field, "avg"); - - // LOG searchbpShardCancellationCount - // LOG.info( - // "SearchBackPressureRCA: searchbpShardCancellationCount: {}", - // searchbpShardCancellationCount); + // shard level cancellation count + Field searchbp_stats_type_field = + DSL.field( + DSL.name( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM + .toString()), + String.class); + + double searchbpShardCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString()); + double searchbpTaskCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + 
.SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + .toString()); + double searchbpJVMShardCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + double searchbpJVMTaskCancellationCount = + getMetric( + this.searchbp_Stats, + searchbp_stats_type_field, + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString()); + LOG.info( + "SearchBackPressureRCA: searchbpShardCancellationCount: {}", + searchbpShardCancellationCount); + // print out searchbpTaskCancellationCount, searchbpJVMShardCancellationCount, + // searchbpJVMTaskCancellationCount + LOG.info( + "SearchBackPressureRCA: searchbpTaskCancellationCount: {}", + searchbpTaskCancellationCount); + LOG.info( + "SearchBackPressureRCA: searchbpJVMShardCancellationCount: {}", + searchbpJVMShardCancellationCount); + LOG.info( + "SearchBackPressureRCA: searchbpJVMTaskCancellationCount: {}", + searchbpJVMTaskCancellationCount); + LOG.info("getSearchBackPressureShardCancellationCount() finished"); return 0; } From c1e957db5c4778450f32d5f1fd22258cfd28d713 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 16:14:08 -0700 Subject: [PATCH 08/73] Add SearchBackPressureRCA Metric (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../model/SearchBackPressureRCAMetric.java | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java new file mode 100644 index 000000000..6ef8b6eae --- /dev/null +++ 
b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -0,0 +1,83 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model; + +/** Represents used heap and max heap in gigabytes */ +public class SearchBackPressureRCAMetric { + private final double usedHeap; + private final double maxHeap; + private final double searchbpShardCancellationCount; + private final double searchbpTaskCancellationCount; + private final double searchbpJVMShardCancellationCount; + private final double searchbpJVMTaskCancellationCount; + + // Constructor + public SearchBackPressureRCAMetric(double usedHeap, double maxHeap, double searchbpShardCancellationCount, + double searchbpTaskCancellationCount, double searchbpJVMShardCancellationCount, + double searchbpJVMTaskCancellationCount) { + this.usedHeap = usedHeap; + this.maxHeap = maxHeap; + this.searchbpShardCancellationCount = searchbpShardCancellationCount; + this.searchbpTaskCancellationCount = searchbpTaskCancellationCount; + this.searchbpJVMShardCancellationCount = searchbpJVMShardCancellationCount; + this.searchbpJVMTaskCancellationCount = searchbpJVMTaskCancellationCount; + } + + // Getters + public double getUsedHeap() { + return usedHeap; + } + + public double getMaxHeap() { + return maxHeap; + } + + public double getSearchbpShardCancellationCount() { + return searchbpShardCancellationCount; + } + + public double getSearchbpTaskCancellationCount() { + return searchbpTaskCancellationCount; + } + + public double getSearchbpJVMShardCancellationCount() { + return searchbpJVMShardCancellationCount; + } + + public double getSearchbpJVMTaskCancellationCount() { + return searchbpJVMTaskCancellationCount; + } + + public double getHeapUsagePercent() { + if (this.getMaxHeap() == 0) { + return 0; + } + return 100 * this.getUsedHeap() / this.getMaxHeap(); + } + 
+ public double getShardJVMCancellationPercent() { + if (this.getSearchbpShardCancellationCount() == 0) { + return 0; + } + return 100 * this.getSearchbpJVMShardCancellationCount() / this.getSearchbpShardCancellationCount(); + } + + public double getTaskJVMCancellationPercent() { + if (this.getSearchbpTaskCancellationCount() == 0) { + return 0; + } + return 100 * this.getSearchbpJVMTaskCancellationCount() / this.getSearchbpTaskCancellationCount(); + } + + public boolean hasValues() { + return this.getUsedHeap() != 0 && this.getMaxHeap() != 0; + } + + @Override + public String toString() { + return "HeapMetric{" + "usedHeap=" + usedHeap + ", maxHeap=" + maxHeap + '}'; + } +} From 55e5cdde7738005d25f60e2e30cf8b4ba959dee5 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 17:11:38 -0700 Subject: [PATCH 09/73] Use SearchBackPressureRCAMetrics to aggregate metrics (signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 2 + .../rca/store/OpenSearchAnalysisGraph.java | 2 +- .../SearchBackPressureRCA.java | 172 +++++++++++++----- .../model/SearchBackPressureRCAMetric.java | 31 +++- 4 files changed, 156 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index e646c3f69..f460f16d1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -29,4 +29,6 @@ public class SearchBackPressureRcaConfig { public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; public SearchBackPressureRcaConfig(final RcaConf conf) {} + + // conf file to get Runtime Threshold for SearchBackPressureRCAConfig (TODO) } diff --git 
a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 5d86ca57c..bae2c74cf 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -444,7 +444,7 @@ public void construct() { // Search Back Pressure Service RCA final SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(heapMax, heapUsed, gcType, searchbp_Stats); + new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index f51570309..f510112d6 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -7,8 +7,10 @@ import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; +import java.time.Clock; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; @@ -18,12 +20,16 @@ import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; +import 
org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; +import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; public class SearchBackPressureRCA extends OldGenRca> { // LOGGER for SearchBackPressureRCA @@ -32,30 +38,46 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; + private final SlidingWindow shardJVMCancellationSlidingWindow; + private final SlidingWindow heapUsageSlidingWindow; + + // Sliding Window Interval + private static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; + private static final int SLIDING_WINDOW_SIZE_IN_SECS = SLIDING_WINDOW_SIZE_IN_MINS * 60; - // track how many samples has been checked (only reach 60s (12 * 5s) to execute - // operate()) + // counter to check the samples has been taken, only emit flow units when counter equals to + // rcaPeriod private long counter; + // Required amount of RCA period this RCA needs to run before sending out a flowunit + private final int rcaPeriod; + + // Current time + protected Clock clock; + // key functions to be overriden // operate(): determine whether to generate of flow unit of HEALTHY or UNHEALTHY // readRcaConf(): read the key configuration metrics like heapMaxThreshold, @@ -75,9 +97,11 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( - final M heapMax, final M heapUsed, M gcType, M 
searchbp_Stats) { + final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; + this.rcaPeriod = rcaPeriod; + this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; @@ -88,6 +112,14 @@ public SearchBackPressureRCA( this.heapCancellationDecreaseMaxThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + // initialize sliding window + this.heapUsageSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.shardJVMCancellationSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.taskJVMCancellationSlidingWindow = + new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + LOG.info("SearchBackPressureRCA initialized"); } @@ -110,30 +142,93 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); - // Use OldGenRca.java to get heap usage and max heap size - double prevHeapUsage = getOldGenUsedOrDefault(0d); - double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + counter += 1; - double heapUsedPercentage = prevHeapUsage / maxHeapSize; + long currTimeStamp = this.clock.millis(); - // function to read cancellation count from sql - getSearchBackPressureShardCancellationCount(); + // read key metrics into searchBackPressureRCAMetric for easier management + SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); // print out oldGenUsed and maxOldGen LOG.info( - "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, heapUsedPercentage: {}", - prevHeapUsage, - maxHeapSize, - heapUsedPercentage); + "SearchBackPressureRCA: oldGenUsed: {} maxOldGen: {}, 
heapUsedPercentage: {}, searchbpShardCancellationCount: {}, searchbpTaskCancellationCount: {}, searchbpJVMShardCancellationCount: {}, searchbpJVMTaskCancellationCount: {}", + searchBackPressureRCAMetric.getUsedHeap(), + searchBackPressureRCAMetric.getMaxHeap(), + searchBackPressureRCAMetric.getHeapUsagePercent(), + searchBackPressureRCAMetric.getSearchbpShardCancellationCount(), + searchBackPressureRCAMetric.getSearchbpTaskCancellationCount(), + searchBackPressureRCAMetric.getSearchbpJVMShardCancellationCount(), + searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); + + // update sliding window if the value is NOT NaN + double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); + if (!Double.isNaN(prevheapUsagePercentage)) { + heapUsageSlidingWindow.next( + new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); + } + + double shardJVMCancellationPercentage = + searchBackPressureRCAMetric.getShardJVMCancellationPercent(); + if (!Double.isNaN(shardJVMCancellationPercentage)) { + shardJVMCancellationSlidingWindow.next( + new SlidingWindowData(currTimeStamp, shardJVMCancellationPercentage)); + } + + double taskJVMCancellationPercentage = + searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); + if (!Double.isNaN(taskJVMCancellationPercentage)) { + taskJVMCancellationSlidingWindow.next( + new SlidingWindowData(currTimeStamp, taskJVMCancellationPercentage)); + } + + LOG.info("SearchBackPressureRCA counter is {}", counter); + // if counter matches the rca period, emit the flow unit + if (counter == this.rcaPeriod) { + ResourceContext context = null; + LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + counter = 0; + + // TODO change to + double maxHeapUsagePercentage = heapUsageSlidingWindow.readAvg(); + double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); + double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); + LOG.info( + 
"SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}", + maxHeapUsagePercentage, + avgShardJVMCancellationPercentage, + avgTaskJVMCancellationPercentage); + + // get the Configured Threshold and compare with Sliding Window Stats + if (maxHeapUsagePercentage > heapUsedDecreaseMinThreshold) { + // Generate a flow unit with an Unhealthy ResourceContext + LOG.info( + "maxHeapUsagePercentage: {} is greater than threshold: {}", + maxHeapUsagePercentage, + heapUsedDecreaseMinThreshold); + + } else { + // Generate a flow unit with a Healthy ResourceContext + LOG.info( + "maxHeapUsagePercentage: {} is less than threshold: {}", + maxHeapUsagePercentage, + heapUsedDecreaseMinThreshold); + } + + } else { + LOG.info("Empty FlowUnit returned for High Heap Usage RCA"); + return new ResourceFlowUnit<>(this.clock.millis()); + } + LOG.info("SearchBackPressureRCA operate() finished"); return null; } - private long getSearchBackPressureShardCancellationCount() { - LOG.info("getSearchBackPressureShardCancellationCount() STARTED"); + private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { + // Get Heap Usage related metrics + double prevHeapUsage = getOldGenUsedOrDefault(0d); + double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); - // Use Searchbp_Stats metrics to get the metrics value - // shard level cancellation count + // Get SearchBack Pressure related metrics from stats type field Field searchbp_stats_type_field = DSL.field( DSL.name( @@ -169,27 +264,14 @@ private long getSearchBackPressureShardCancellationCount() { AllMetrics.SearchBackPressureStatsValue .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT .toString()); - LOG.info( - "SearchBackPressureRCA: searchbpShardCancellationCount: {}", - searchbpShardCancellationCount); - // print out searchbpTaskCancellationCount, searchbpJVMShardCancellationCount, - // searchbpJVMTaskCancellationCount - 
LOG.info( - "SearchBackPressureRCA: searchbpTaskCancellationCount: {}", - searchbpTaskCancellationCount); - LOG.info( - "SearchBackPressureRCA: searchbpJVMShardCancellationCount: {}", - searchbpJVMShardCancellationCount); - LOG.info( - "SearchBackPressureRCA: searchbpJVMTaskCancellationCount: {}", - searchbpJVMTaskCancellationCount); - LOG.info("getSearchBackPressureShardCancellationCount() finished"); - return 0; - } - - private long getSearchBackPressureTaskCancellationCount() { - return 0; + return new SearchBackPressureRCAMetric( + prevHeapUsage, + maxHeapSize, + searchbpShardCancellationCount, + searchbpTaskCancellationCount, + searchbpJVMShardCancellationCount, + searchbpJVMTaskCancellationCount); } private double getMetric(M metric, Field field, String fieldName) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java index 6ef8b6eae..718c76b8f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -15,8 +15,12 @@ public class SearchBackPressureRCAMetric { private final double searchbpJVMTaskCancellationCount; // Constructor - public SearchBackPressureRCAMetric(double usedHeap, double maxHeap, double searchbpShardCancellationCount, - double searchbpTaskCancellationCount, double searchbpJVMShardCancellationCount, + public SearchBackPressureRCAMetric( + double usedHeap, + double maxHeap, + double searchbpShardCancellationCount, + double searchbpTaskCancellationCount, + double searchbpJVMShardCancellationCount, double searchbpJVMTaskCancellationCount) { this.usedHeap = usedHeap; this.maxHeap = maxHeap; @@ -62,14 +66,18 @@ public double 
getShardJVMCancellationPercent() { if (this.getSearchbpShardCancellationCount() == 0) { return 0; } - return 100 * this.getSearchbpJVMShardCancellationCount() / this.getSearchbpShardCancellationCount(); + return 100 + * this.getSearchbpJVMShardCancellationCount() + / this.getSearchbpShardCancellationCount(); } public double getTaskJVMCancellationPercent() { if (this.getSearchbpTaskCancellationCount() == 0) { return 0; } - return 100 * this.getSearchbpJVMTaskCancellationCount() / this.getSearchbpTaskCancellationCount(); + return 100 + * this.getSearchbpJVMTaskCancellationCount() + / this.getSearchbpTaskCancellationCount(); } public boolean hasValues() { @@ -78,6 +86,19 @@ public boolean hasValues() { @Override public String toString() { - return "HeapMetric{" + "usedHeap=" + usedHeap + ", maxHeap=" + maxHeap + '}'; + return "HeapMetric{" + + "usedHeap=" + + usedHeap + + ", maxHeap=" + + maxHeap + + ", searchbpShardCancellationCount=" + + searchbpShardCancellationCount + + ", searchbpTaskCancellationCount=" + + searchbpTaskCancellationCount + + ", searchbpJVMShardCancellationCount=" + + searchbpJVMShardCancellationCount + + ", searchbpJVMTaskCancellationCount=" + + searchbpJVMTaskCancellationCount + + '}'; } } From 28980608ca5b91229309eecc27479410b070a2a9 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 27 Jun 2023 17:36:31 -0700 Subject: [PATCH 10/73] Add the conf file extracted part for SearchBackPressureRcaConfig.java (signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 72 ++++++++++++++++--- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index f460f16d1..86ee11848 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ 
b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -4,31 +4,87 @@ */ package org.opensearch.performanceanalyzer.rca.configs; - - import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; public class SearchBackPressureRcaConfig { public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; - // INTERVAL PERIOD IN SECONDS + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; // Increase Threshold // node max heap usage in last 60 secs is less than 70% - public static final int DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD = 70; + public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; + private Integer maxHeapIncreasePercentageThreshold; // cancellationCount due to heap is more than 50% of all task cancellations. public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxHeapCancellationPercentageThreshold; // Decrease Threshold // node min heap usage in last 60 secs is more than 80% - public static final int DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD = 80; + public static final int DEFAULT_MAX_HEAP_DECREASE_THRESHOLD = 80; + private Integer maxHeapDecreasePercentageThreshold; - // cancellationCount due to heap is more than 30% of all task cancellations + // cancellationCount due to heap is less than 30% of all task cancellations public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minHeapCancellationPercentageThreshold; + + public SearchBackPressureRcaConfig(final RcaConf conf) { + // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, else, default value gets returned + maxHeapIncreasePercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_INCREASE_FIELD, + DEFAULT_MAX_HEAP_INCREASE_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxHeapCancellationPercentageThreshold = + conf.readRcaConfig( + 
CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxHeapDecreasePercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_DECREASE_FIELD, + DEFAULT_MAX_HEAP_DECREASE_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + minHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + } + + // Getters for private field + public int getMaxHeapIncreasePercentageThreshold() { + return maxHeapIncreasePercentageThreshold; + } + + public int getMaxHeapCancellationPercentageThreshold() { + return maxHeapCancellationPercentageThreshold; + } + + public int getMaxHeapDecreasePercentageThreshold() { + return maxHeapDecreasePercentageThreshold; + } - public SearchBackPressureRcaConfig(final RcaConf conf) {} + public int getMinHeapCancellationPercentageThreshold() { + return minHeapCancellationPercentageThreshold; + } - // conf file to get Runtime Threshold for SearchBackPressureRCAConfig (TODO) + // name for the configuration field + public static class RCA_CONF_KEY_CONSTANTS { + public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; + public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = "max-heap-cancellation-percentage"; + public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; + public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = "min-heap-cancellation-percentage"; + } } + \ No newline at end of file From 48c92fbd1f824cf46b24e0666d3130b47f47e334 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 10:28:21 -0700 Subject: [PATCH 11/73] Add MinMaxSlidingWindow in OldGenRca (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: 
CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 12 ++-- .../rca/store/rca/OldGenRca.java | 32 +++++++++ .../SearchBackPressureRCA.java | 66 ++++++++++++------- 3 files changed, 84 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 86ee11848..a71e74da8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -4,6 +4,8 @@ */ package org.opensearch.performanceanalyzer.rca.configs; + + import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; public class SearchBackPressureRcaConfig { @@ -31,7 +33,8 @@ public class SearchBackPressureRcaConfig { private Integer minHeapCancellationPercentageThreshold; public SearchBackPressureRcaConfig(final RcaConf conf) { - // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, else, default value gets returned + // (s) -> s >= 0 && s <= 100 is the validator; if it passes, the value from the conf file is returned, + // otherwise the default value is returned maxHeapIncreasePercentageThreshold = conf.readRcaConfig( CONFIG_NAME, @@ -82,9 +85,10 @@ public int getMinHeapCancellationPercentageThreshold() { // name for the configuration field public static class RCA_CONF_KEY_CONSTANTS { public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; - public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = "max-heap-cancellation-percentage"; + public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-heap-cancellation-percentage"; public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; - public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = 
"min-heap-cancellation-percentage"; + public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-heap-cancellation-percentage"; } } - \ No newline at end of file diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index fc3558339..a53c2b7bf 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -250,4 +250,36 @@ public double readMin() { return Double.NaN; } } + + /** + * Sliding window to check the max/min old gen usage within a given time frame. The previous + * MinOldGenSlidingWindow should be deprecated since it modifies the sliding window size in + * next() + */ + public static class MinMaxOldGenSlidingWindow extends SlidingWindow { + + public MinMaxOldGenSlidingWindow(int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit) { + super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); + } + + public double readMax() { + if (!windowDeque.isEmpty()) { + return windowDeque.stream() + .mapToDouble(SlidingWindowData::getValue) + .max() + .orElse(Double.NaN); + } + return Double.NaN; + } + + public double readMin() { + if (!windowDeque.isEmpty()) { + return windowDeque.stream() + .mapToDouble(SlidingWindowData::getValue) + .min() + .orElse(Double.NaN); + } + return Double.NaN; + } + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index f510112d6..9624b2088 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -20,6 +20,7 @@ import
org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.Resources; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; @@ -27,6 +28,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; @@ -48,12 +50,12 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; private final SlidingWindow shardJVMCancellationSlidingWindow; - private final SlidingWindow heapUsageSlidingWindow; + private final MinMaxOldGenSlidingWindow heapUsageSlidingWindow; // Sliding Window Interval private static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; @@ -103,18 +105,18 @@ public SearchBackPressureRCA( this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; - this.heapUsedIncreaseMaxThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DOWNFLOW_THRESHOLD; + this.heapUsedIncreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; this.heapCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; - 
this.heapUsedDecreaseMinThreshold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_OVERFLOW_THRESHOLD; - this.heapCancellationDecreaseMaxThreashold = + this.heapUsedDecreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DECREASE_THRESHOLD; + this.heapCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; // initialize sliding window this.heapUsageSlidingWindow = - new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.taskJVMCancellationSlidingWindow = @@ -167,6 +169,9 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); } + // for testing + // heapUsageSlidingWindow.next(new SlidingWindowData(currTimeStamp, 80.3)); + double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { @@ -186,32 +191,49 @@ public ResourceFlowUnit operate() { if (counter == this.rcaPeriod) { ResourceContext context = null; LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + long currentTimeMillis = System.currentTimeMillis(); counter = 0; - // TODO change to - double maxHeapUsagePercentage = heapUsageSlidingWindow.readAvg(); + double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); + double minHeapUsagePercentage = heapUsageSlidingWindow.readMin(); double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); + LOG.info( - "SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}, SearchBackPressureRCA: maxHeapUsagePercentage: {}", + "SearchBackPressureRCA: maxHeapUsagePercentage: {}, 
minHeapUsagePercentage: {}, SearchBackPressureRCA: avgShardJVMCancellationPercentage: {}, SearchBackPressureRCA: avgTaskJVMCancellationPercentage: {}", maxHeapUsagePercentage, + minHeapUsagePercentage, avgShardJVMCancellationPercentage, avgTaskJVMCancellationPercentage); + InstanceDetails instanceDetails = getInstanceDetails(); + HotNodeSummary nodeSummary = + new HotNodeSummary( + instanceDetails.getInstanceId(), instanceDetails.getInstanceIp()); // get the Configured Threshold and compare with Sliding Window Stats - if (maxHeapUsagePercentage > heapUsedDecreaseMinThreshold) { + /* + * 2 cases we send Unhealthy ResourceContext when we need to autotune the threshold + * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations + * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations + */ + if ((maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgShardJVMCancellationPercentage > heapCancellationIncreaseMaxThreshold)) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "maxHeapUsagePercentage: {} is greater than threshold: {}", + "Condition 1 Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, - heapUsedDecreaseMinThreshold); - + heapUsedIncreaseThreshold, + avgShardJVMCancellationPercentage, + heapCancellationIncreaseMaxThreshold); + + context = new ResourceContext(Resources.State.UNHEALTHY); + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); } else { - // Generate a flow unit with a Healthy ResourceContext - LOG.info( - "maxHeapUsagePercentage: {} is less than threshold: {}", - maxHeapUsagePercentage, - heapUsedDecreaseMinThreshold); + 
LOG.info("cindition 1 is not met."); } } else { From c84cf3435b870ee4844deb5d6c6032931015e66b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 11:53:07 -0700 Subject: [PATCH 12/73] Rename SearchBackPressureClusterRCA and add it to AnalysisGraph (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 12 +-- .../rca/store/OpenSearchAnalysisGraph.java | 19 ++++- ...java => SearchBackPressureClusterRCA.java} | 10 ++- .../SearchBackPressureRCA.java | 80 ++++++++++--------- 4 files changed, 71 insertions(+), 50 deletions(-) rename src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/{SearchBackPressurClusterRCA.java => SearchBackPressureClusterRCA.java} (66%) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index a71e74da8..7d6bce315 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -25,8 +25,8 @@ public class SearchBackPressureRcaConfig { // Decrease Threshold // node min heap usage in last 60 secs is more than 80% - public static final int DEFAULT_MAX_HEAP_DECREASE_THRESHOLD = 80; - private Integer maxHeapDecreasePercentageThreshold; + public static final int DEFAULT_MIN_HEAP_DECREASE_THRESHOLD = 80; + private Integer minHeapDecreasePercentageThreshold; // cancellationCount due to heap is less than 30% of all task cancellations public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; @@ -49,11 +49,11 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - maxHeapDecreasePercentageThreshold = + minHeapDecreasePercentageThreshold = 
conf.readRcaConfig( CONFIG_NAME, RCA_CONF_KEY_CONSTANTS.MAX_HEAP_USAGE_DECREASE_FIELD, - DEFAULT_MAX_HEAP_DECREASE_THRESHOLD, + DEFAULT_MIN_HEAP_DECREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minHeapCancellationPercentageThreshold = @@ -74,8 +74,8 @@ public int getMaxHeapCancellationPercentageThreshold() { return maxHeapCancellationPercentageThreshold; } - public int getMaxHeapDecreasePercentageThreshold() { - return maxHeapDecreasePercentageThreshold; + public int getMinHeapDecreasePercentageThreshold() { + return minHeapDecreasePercentageThreshold; } public int getMinHeapCancellationPercentageThreshold() { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index bae2c74cf..967eded24 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -86,6 +86,7 @@ import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenContendedRca; import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.OldGenReclamationRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureRCA; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.NodeTemperatureRca; import org.opensearch.performanceanalyzer.rca.store.rca.temperature.dimension.CpuUtilDimensionTemperatureRca; @@ -442,8 +443,8 @@ public void construct() { shardRequestCacheClusterRca, highHeapUsageClusterRca)); - // Search Back Pressure Service RCA - final SearchBackPressureRCA searchBackPressureRCA = + // Search Back Pressure Service RCA enabled + SearchBackPressureRCA 
searchBackPressureRCA = new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, @@ -451,6 +452,20 @@ public void construct() { searchBackPressureRCA.addAllUpstreams( Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); + // Search Back Pressure Service Cluster RCA enabled + SearchBackPressureClusterRCA searchBackPressureClusterRCA = + new SearchBackPressureClusterRCA(RCA_PERIOD, searchBackPressureRCA); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); + searchBackPressureClusterRCA.addAllUpstreams( + Collections.singletonList(searchBackPressureRCA)); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, + RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); + + // To Do SearchBackPressure Decider + AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java similarity index 66% rename from src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java rename to src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java index b97d9c0cd..2f2ea88a5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressurClusterRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureClusterRCA.java @@ -6,17 +6,21 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; 
import org.opensearch.performanceanalyzer.rca.framework.api.Rca; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.store.rca.cluster.BaseClusterRca; -public class SearchBackPressurClusterRCA extends BaseClusterRca { +public class SearchBackPressureClusterRCA extends BaseClusterRca { - public static final String RCA_TABLE_NAME = SearchBackPressurClusterRCA.class.getSimpleName(); + public static final String RCA_TABLE_NAME = SearchBackPressureClusterRCA.class.getSimpleName(); + private static final Logger LOG = LogManager.getLogger(SearchBackPressureClusterRCA.class); - public >> SearchBackPressurClusterRCA( + public >> SearchBackPressureClusterRCA( final int rcaPeriod, final R SearchBackPressureRCA) { super(rcaPeriod, SearchBackPressureRCA); + LOG.info("SearchBackPressureClusterRCA enabeld."); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 9624b2088..fa0dda859 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -47,8 +47,6 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); @@ -110,7 +90,7 @@ public SearchBackPressureRCA( this.heapCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapUsedDecreaseThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_DECREASE_THRESHOLD; + 
SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; this.heapCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; @@ -144,9 +124,11 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); - counter += 1; - long currTimeStamp = this.clock.millis(); + counter += 1; + ResourceContext context = null; + long currentTimeMillis = System.currentTimeMillis(); + ; // read key metrics into searchBackPressureRCAMetric for easier management SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); @@ -166,32 +148,33 @@ public ResourceFlowUnit operate() { double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { heapUsageSlidingWindow.next( - new SlidingWindowData(currTimeStamp, prevheapUsagePercentage)); + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); } // for testing - // heapUsageSlidingWindow.next(new SlidingWindowData(currTimeStamp, 80.3)); + // heapUsageSlidingWindow.next(new SlidingWindowData(currentTimeMillis, 65.3)); double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { shardJVMCancellationSlidingWindow.next( - new SlidingWindowData(currTimeStamp, shardJVMCancellationPercentage)); + new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); } double taskJVMCancellationPercentage = searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); if (!Double.isNaN(taskJVMCancellationPercentage)) { taskJVMCancellationSlidingWindow.next( - new SlidingWindowData(currTimeStamp, taskJVMCancellationPercentage)); + new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); } LOG.info("SearchBackPressureRCA counter is {}", counter); // if 
counter matches the rca period, emit the flow unit if (counter == this.rcaPeriod) { - ResourceContext context = null; LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); - long currentTimeMillis = System.currentTimeMillis(); + currentTimeMillis = System.currentTimeMillis(); + + // reset counter counter = 0; double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); @@ -216,11 +199,20 @@ public ResourceFlowUnit operate() { * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - if ((maxHeapUsagePercentage < heapUsedIncreaseThreshold) - && (avgShardJVMCancellationPercentage > heapCancellationIncreaseMaxThreshold)) { + // avgShardJVMCancellationPercentage = 80.0; // testing + boolean increaseThresholdMet = + (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgShardJVMCancellationPercentage + > heapCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMet = + (minHeapUsagePercentage > heapUsedDecreaseThreshold) + && (avgShardJVMCancellationPercentage + < heapCancellationDecreaseMinThreashold); + + if (increaseThresholdMet || decreaseThresholdMet) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "Condition 1 Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", + "Increase/Decrease Condition Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, avgShardJVMCancellationPercentage, @@ -233,16 +225,20 @@ public ResourceFlowUnit operate() { nodeSummary, !instanceDetails.getIsClusterManager()); } else { 
- LOG.info("cindition 1 is not met."); + // if autotune is not triggered, return healthy state + context = new ResourceContext(Resources.State.HEALTHY); + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); } - } else { - LOG.info("Empty FlowUnit returned for High Heap Usage RCA"); - return new ResourceFlowUnit<>(this.clock.millis()); + // return healthy state when the counter does not meet rcaPeriod + LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); + currentTimeMillis = System.currentTimeMillis(); + return new ResourceFlowUnit<>(currentTimeMillis); } - - LOG.info("SearchBackPressureRCA operate() finished"); - return null; } private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { @@ -322,5 +318,11 @@ public void readRcaConf(RcaConf conf) { final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); // read anything from config file in runtime // if not just skip it + this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); + this.heapCancellationIncreaseMaxThreshold = + config.getMaxHeapCancellationPercentageThreshold(); + this.heapUsedDecreaseThreshold = config.getMinHeapDecreasePercentageThreshold(); + this.heapCancellationDecreaseMinThreashold = + config.getMinHeapCancellationPercentageThreshold(); } } From 08f69270eba609c0e5e52cafd6af1585b9659cf6 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 16:59:08 -0700 Subject: [PATCH 13/73] Add basic UTs for SearchBackPressureRCA cluster/node level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java diff --git 
a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java new file mode 100644 index 000000000..9ebde61ed --- /dev/null +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -0,0 +1,217 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.mockito.Mockito.when; +import static org.mockito.MockitoAnnotations.initMocks; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.IntStream; +import org.junit.Before; +import org.junit.Test; +import org.mockito.Mock; +import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; +import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; + +public class SearchBackPressureRcaTest { + // Mock Metrics + @Mock private Metric mockHeapUsed; + + @Mock private Metric mockHeapMax; + + @Mock private Metric mockGcType; + + @Mock private Metric mockSearchbpStats; + + // every 5s operate() gets initiated + private static final int RCA_PERIOD = 5; + + private SearchBackPressureRCA testRca; + private MetricTestHelper 
metricTestHelper; + private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; + + // mock heap metric columns + private final List heapTableColumns = + Arrays.asList( + AllMetrics.HeapDimension.MEM_TYPE.toString(), + MetricsDB.SUM, + MetricsDB.AVG, + MetricsDB.MIN, + MetricsDB.MAX); + + // mock search back pressure metric columns + private final List searchbpTableColumns = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue.SEARCHBP_TYPE_DIM.toString(), + MetricsDB.SUM, + MetricsDB.AVG, + MetricsDB.MIN, + MetricsDB.MAX); + + // dummy field to create a mock gcType Metric + private static final String CMS_COLLECTOR = "ConcurrentMarkSweep"; + + /* + * initialization before running any test + * + */ + @Before + public void setup() throws Exception { + initMocks(this); + this.metricTestHelper = new MetricTestHelper(RCA_PERIOD); + setupMockHeapMetric(mockHeapUsed, 80.0); + setupMockHeapMetric(mockHeapMax, 100.0); + // gcType is required for constructor of SearchBackPressureRCA but the exact type of gcType + // does not matter + setupMockGcType(CMS_COLLECTOR); + + // set up SearchBp_Stats table + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + + this.testRca = + new SearchBackPressureRCA( + RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); + } + + @Test + public void testSearchBackpressureGetResourceContextGeneral() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + System.out.println("testAdmissionControlRcaSmallMaxHeap started"); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + + assertFalse(flowUnit.isEmpty()); + ResourceContext context = flowUnit.getResourceContext(); + assertTrue(context.isHealthy()); + } + + private void setupMockHeapMetric(final Metric metric, final double val) { + String 
valString = Double.toString(val); + List data = + Arrays.asList( + AllMetrics.GCType.OLD_GEN.toString(), + valString, + valString, + valString, + valString); + when(metric.getFlowUnits()) + .thenReturn( + Collections.singletonList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + heapTableColumns, data)))); + } + + private void setupMockGcType(final String collector) { + List gcInfoTableColumns = + Arrays.asList( + AllMetrics.GCInfoDimension.MEMORY_POOL.toString(), + AllMetrics.GCInfoDimension.COLLECTOR_NAME.toString()); + List data = Arrays.asList(AllMetrics.GCType.OLD_GEN.toString(), collector); + when(mockGcType.getFlowUnits()) + .thenReturn( + Collections.singletonList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + gcInfoTableColumns, data)))); + } + + private void setupMockSearchbpStats( + final Metric metric, + final double searchbpShardCancellationCount, + final double searchbpTaskCancellationCount, + final double searchbpJVMShardCancellationCount, + final double searchbpJVMTaskCancellationCount) { + String searchbpShardCancellationCountStr = Double.toString(searchbpShardCancellationCount); + String searchbpTaskCancellationCountStr = Double.toString(searchbpTaskCancellationCount); + String searchbpJVMShardCancellationCountStr = + Double.toString(searchbpJVMShardCancellationCount); + String searchbpJVMTaskCancellationCountStr = + Double.toString(searchbpJVMTaskCancellationCount); + + // add searchbpShardCancellationCountStr row + List searchbpShardCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_CANCELLATIONCOUNT + .toString(), + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr, + searchbpShardCancellationCountStr); + + // add searchbpTaskCancellationCountStr row + List searchbpTaskCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_CANCELLATIONCOUNT + 
.toString(), + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr, + searchbpTaskCancellationCountStr); + + // add searchbpJVMShardCancellationCountStr row + List searchbpJVMShardCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_SHARD_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString(), + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr, + searchbpJVMShardCancellationCountStr); + + // add searchbpJVMTaskCancellationCountStr row + List searchbpJVMTaskCancellationCountRow = + Arrays.asList( + AllMetrics.SearchBackPressureStatsValue + .SEARCHBP_TASK_STATS_RESOURCE_HEAP_USAGE_CANCELLATIONCOUNT + .toString(), + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr, + searchbpJVMTaskCancellationCountStr); + + List flowUnits = + Arrays.asList( + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, searchbpShardCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, searchbpTaskCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, + searchbpJVMShardCancellationCountRow)), + new MetricFlowUnit( + 0, + metricTestHelper.createTestResult( + searchbpTableColumns, + searchbpJVMTaskCancellationCountRow))); + + when(metric.getFlowUnits()).thenReturn(flowUnits); + } +} From 31e8b49ac9f0c873b45bd8bbd193cedeed60121d Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 28 Jun 2023 17:17:02 -0700 Subject: [PATCH 14/73] Add unhealthy/healthy stats UTs for SearchBackPressureRCA cluster/node level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 79 +++++++++++++++++-- 1 file changed, 74 insertions(+), 5 deletions(-) diff --git 
a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 9ebde61ed..cf8f1c759 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -20,7 +20,6 @@ import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; -import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; @@ -86,8 +85,27 @@ public void setup() throws Exception { RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); } + /* + * Test SearchBackPressure RCA returns empty resourceFlowUnit if counter is less than the rcaPeriod + */ @Test - public void testSearchBackpressureGetResourceContextGeneral() { + public void testSearchBpGetResourceContextLessRcaPeriod() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + + ResourceFlowUnit flowUnit = testRca.operate(); + + // counter = 1 + // counter needs to equal to RcaPeriod (5 in this case) to get nonempty resourceflowunit + assertTrue(flowUnit.isEmpty()); + } + + /* + * Test SearchBackPressure RCA returns nonempty resourceFlowUnit if counter equals to rcaPeriod + */ + @Test + public void 
testSearchBpGetResourceContextEqualRcaPeriod() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); @@ -96,9 +114,60 @@ public void testSearchBackpressureGetResourceContextGeneral() { ResourceFlowUnit flowUnit = testRca.operate(); + // counter = RCA_PERIOD + // counter needs to equal to RcaPeriod (5 in this case) to get nonempty resourceflowunit + assertFalse(flowUnit.isEmpty()); + } + + /* + * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune + */ + // @Test + // public void testSearchBpGetHealthyFlowUnit() { + // setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + // setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + // setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + // System.out.println("testAdmissionControlRcaSmallMaxHeap started"); + // IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + // ResourceFlowUnit flowUnit = testRca.operate(); + // assertFalse(flowUnit.isEmpty()); + // } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations. 
+ */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); assertFalse(flowUnit.isEmpty()); - ResourceContext context = flowUnit.getResourceContext(); - assertTrue(context.isHealthy()); + assertFalse(flowUnit.getResourceContext().isHealthy()); } private void setupMockHeapMetric(final Metric metric, final double val) { @@ -212,6 +281,6 @@ private void setupMockSearchbpStats( searchbpTableColumns, searchbpJVMTaskCancellationCountRow))); - when(metric.getFlowUnits()).thenReturn(flowUnits); + when(metric.getFlowUnits()).thenReturn(flowUnits); } } From 4bfa1b2c9d4c117821617e8a6acc9987d8b3fbb8 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 29 Jun 2023 09:27:22 -0700 Subject: [PATCH 15/73] Add healthy resource unit UT (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 24 +++++++++---------- 
1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index cf8f1c759..2f38bb2e4 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -109,7 +109,6 @@ public void testSearchBpGetResourceContextEqualRcaPeriod() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); - System.out.println("testAdmissionControlRcaSmallMaxHeap started"); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); @@ -121,18 +120,19 @@ public void testSearchBpGetResourceContextEqualRcaPeriod() { /* * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune + * Meeting None of Increasing or Decreasing Threshold */ - // @Test - // public void testSearchBpGetHealthyFlowUnit() { - // setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); - // setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); - // setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); - // System.out.println("testAdmissionControlRcaSmallMaxHeap started"); - // IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); - - // ResourceFlowUnit flowUnit = testRca.operate(); - // assertFalse(flowUnit.isEmpty()); - // } + @Test + public void testSearchBpGetHealthyFlowUnit() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.8); + setupMockSearchbpStats(mockSearchbpStats, 
10.0, 10.0, 8.0, 7.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertTrue(flowUnit.getResourceContext().isHealthy()); + } /* * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold From 13e2d48e503eb87bf9f7f7656221a35a47172f7c Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 29 Jun 2023 13:09:49 -0700 Subject: [PATCH 16/73] Add UT s both shard/task level (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 82 +++++++++----- .../SearchBackPressureRCA.java | 100 ++++++++++++++---- .../SearchBackPressureRcaTest.java | 50 +++++++-- 3 files changed, 180 insertions(+), 52 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 7d6bce315..9a6512893 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -14,23 +14,31 @@ public class SearchBackPressureRcaConfig { // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; - // Increase Threshold + /* Increase Threshold */ // node max heap usage in last 60 secs is less than 70% public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; private Integer maxHeapIncreasePercentageThreshold; - // cancellationCount due to heap is more than 50% of all task cancellations. 
- public static final int DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD = 50; - private Integer maxHeapCancellationPercentageThreshold; + // cancellationCount due to heap is more than 50% of all task cancellations in shard level + public static final int DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxShardHeapCancellationPercentageThreshold; - // Decrease Threshold + // cancellationCount due to heap is more than 50% of all task cancellations in task level + public static final int DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD = 50; + private Integer maxTaskHeapCancellationPercentageThreshold; + + /* Decrease Threshold */ // node min heap usage in last 60 secs is more than 80% public static final int DEFAULT_MIN_HEAP_DECREASE_THRESHOLD = 80; private Integer minHeapDecreasePercentageThreshold; - // cancellationCount due to heap is less than 30% of all task cancellations - public static final int DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD = 30; - private Integer minHeapCancellationPercentageThreshold; + // cancellationCount due to heap is less than 30% of all task cancellations in shard level + public static final int DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minShardHeapCancellationPercentageThreshold; + + // cancellationCount due to heap is less than 30% of all task cancellations in task level + public static final int DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD = 30; + private Integer minTaskHeapCancellationPercentageThreshold; public SearchBackPressureRcaConfig(final RcaConf conf) { // (s) -> s > 0 is the validator, if validated, fields from conf file will be returned, @@ -42,11 +50,18 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MAX_HEAP_INCREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - maxHeapCancellationPercentageThreshold = + maxShardHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD, - 
DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD, + RCA_CONF_KEY_CONSTANTS.MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + maxTaskHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); minHeapDecreasePercentageThreshold = @@ -56,39 +71,58 @@ public SearchBackPressureRcaConfig(final RcaConf conf) { DEFAULT_MIN_HEAP_DECREASE_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); - minHeapCancellationPercentageThreshold = + minShardHeapCancellationPercentageThreshold = + conf.readRcaConfig( + CONFIG_NAME, + RCA_CONF_KEY_CONSTANTS.MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD, + (s) -> s >= 0 && s <= 100, + Integer.class); + minTaskHeapCancellationPercentageThreshold = conf.readRcaConfig( CONFIG_NAME, - RCA_CONF_KEY_CONSTANTS.MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD, - DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD, + RCA_CONF_KEY_CONSTANTS.MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD, + DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD, (s) -> s >= 0 && s <= 100, Integer.class); } // Getters for private field - public int getMaxHeapIncreasePercentageThreshold() { + public Integer getMaxHeapIncreasePercentageThreshold() { return maxHeapIncreasePercentageThreshold; } - public int getMaxHeapCancellationPercentageThreshold() { - return maxHeapCancellationPercentageThreshold; + public Integer getMaxShardHeapCancellationPercentageThreshold() { + return maxShardHeapCancellationPercentageThreshold; } - public int getMinHeapDecreasePercentageThreshold() { + public Integer getMaxTaskHeapCancellationPercentageThreshold() { + return maxTaskHeapCancellationPercentageThreshold; + } + + public Integer getMinHeapDecreasePercentageThreshold() { return minHeapDecreasePercentageThreshold; } - 
public int getMinHeapCancellationPercentageThreshold() { - return minHeapCancellationPercentageThreshold; + public Integer getMinShardHeapCancellationPercentageThreshold() { + return minShardHeapCancellationPercentageThreshold; + } + + public Integer getMinTaskHeapCancellationPercentageThreshold() { + return minTaskHeapCancellationPercentageThreshold; } // name for the configuration field public static class RCA_CONF_KEY_CONSTANTS { public static final String MAX_HEAP_USAGE_INCREASE_FIELD = "max-heap-usage-increase"; - public static final String MAX_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "max-heap-cancellation-percentage"; + public static final String MAX_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-shard-heap-cancellation-percentage"; + public static final String MAX_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "max-task-heap-cancellation-percentage"; public static final String MAX_HEAP_USAGE_DECREASE_FIELD = "max-heap-usage-decrease"; - public static final String MIN_HEAP_CANCELLATION_PERCENTAGE_FIELD = - "min-heap-cancellation-percentage"; + public static final String MIN_SHARD_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-shard-heap-cancellation-percentage"; + public static final String MIN_TASK_HEAP_CANCELLATION_PERCENTAGE_FIELD = + "min-task-heap-cancellation-percentage"; } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index fa0dda859..aabbd1fc2 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -44,16 +44,24 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( this.searchbp_Stats = searchbp_Stats; this.heapUsedIncreaseThreshold = 
SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; - this.heapCancellationIncreaseMaxThreshold = - SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapShardCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; + this.heapTaskCancellationIncreaseMaxThreshold = + SearchBackPressureRcaConfig.DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapUsedDecreaseThreshold = SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; - this.heapCancellationDecreaseMinThreashold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_CANCELLATION_THRESHOLD; + this.heapShardCancellationDecreaseMinThreashold = + SearchBackPressureRcaConfig.DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD; + this.heapTaskCancellationDecreaseMinThreashold = + SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; // initialize sliding window this.heapUsageSlidingWindow = @@ -200,25 +212,66 @@ public ResourceFlowUnit operate() { * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ // avgShardJVMCancellationPercentage = 80.0; // testing - boolean increaseThresholdMet = + + // TODO: add Task CancellationCountPercentage as another criteria + // TODO + /* + * HotResourceSummary resourceSummary = + new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, previousThreshold, 0); + nodeSummary.appendNestedSummary(resourceSummary); + + If you + */ + boolean increaseThresholdMetByShard = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgShardJVMCancellationPercentage - > heapCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMet = + > heapShardCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMetByShard = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgShardJVMCancellationPercentage - < heapCancellationDecreaseMinThreashold); + < 
heapShardCancellationDecreaseMinThreashold); + + boolean increaseThresholdMetByTask = + (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + && (avgTaskJVMCancellationPercentage + > heapTaskCancellationIncreaseMaxThreshold); + boolean decreaseThresholdMetByTask = + (minHeapUsagePercentage > heapUsedDecreaseThreshold) + && (avgTaskJVMCancellationPercentage + < heapTaskCancellationDecreaseMinThreashold); + + // HotResourceSummary resourceSummary = + // new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, + // previousThreshold, 0); + // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseThresholdMet || decreaseThresholdMet) { + if (increaseThresholdMetByShard || decreaseThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( - "Increase/Decrease Condition Meet, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapCancellationIncreaseMaxThreshold: {}", + "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, heapUsedIncreaseThreshold, avgShardJVMCancellationPercentage, - heapCancellationIncreaseMaxThreshold); + heapShardCancellationIncreaseMaxThreshold); context = new ResourceContext(Resources.State.UNHEALTHY); + // add an additional resource with metadata: shard-level + return new ResourceFlowUnit<>( + currentTimeMillis, + context, + nodeSummary, + !instanceDetails.getIsClusterManager()); + } else if (increaseThresholdMetByTask || decreaseThresholdMetByTask) { + // Generate a flow unit with an Unhealthy ResourceContext + LOG.info( + "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", + maxHeapUsagePercentage, + heapUsedIncreaseThreshold, + 
avgTaskJVMCancellationPercentage, + heapTaskCancellationIncreaseMaxThreshold); + + context = new ResourceContext(Resources.State.UNHEALTHY); + // add an additional resource with metadata: task-level return new ResourceFlowUnit<>( currentTimeMillis, context, @@ -233,6 +286,7 @@ public ResourceFlowUnit operate() { nodeSummary, !instanceDetails.getIsClusterManager()); } + } else { // return healthy state when the counter does not meet rcaPeriod LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); @@ -319,10 +373,14 @@ public void readRcaConf(RcaConf conf) { // read anything from config file in runtime // if not just skip it this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); - this.heapCancellationIncreaseMaxThreshold = - config.getMaxHeapCancellationPercentageThreshold(); + this.heapShardCancellationIncreaseMaxThreshold = + config.getMaxShardHeapCancellationPercentageThreshold(); + this.heapTaskCancellationIncreaseMaxThreshold = + config.getMaxTaskHeapCancellationPercentageThreshold(); this.heapUsedDecreaseThreshold = config.getMinHeapDecreasePercentageThreshold(); - this.heapCancellationDecreaseMinThreashold = - config.getMinHeapCancellationPercentageThreshold(); + this.heapShardCancellationDecreaseMinThreashold = + config.getMinShardHeapCancellationPercentageThreshold(); + this.heapTaskCancellationDecreaseMinThreashold = + config.getMinTaskHeapCancellationPercentageThreshold(); } } diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 2f38bb2e4..de6014fb4 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -120,7 +120,7 @@ public void 
testSearchBpGetResourceContextEqualRcaPeriod() { /* * Test SearchBackPressure RCA returns healthy nonempty flow units if the settings does not trigger autotune - * Meeting None of Increasing or Decreasing Threshold + * Meeting None of Increasing or Decreasing Threshold for both shard/task level */ @Test public void testSearchBpGetHealthyFlowUnit() { @@ -138,13 +138,49 @@ public void testSearchBpGetHealthyFlowUnit() { * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold * Increasing threshold: * node max heap usage in last 60 secs is less than 70% - * cancellationCount due to heap is more than 50% of all task cancellations. + * cancellationCount due to heap is more than 50% of all task cancellations (Shard-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitByShardIncreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); - setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 4.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by increasing threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Task-Level). 
+ */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByTaskIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.3); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 4.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Shard-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByShardDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); @@ -156,13 +192,13 @@ public void testSearchBpGetUnHealthyFlowUnitByIncreaseThreshold() { * Test SearchBackPressure RCA returns unhealthy nonempty flow units if the settings does trigger autotune by decreasing threshold * decreasing threshold: * node min heap usage in last 60 secs is more than 80% - * cancellationCount due to heap is less than 30% of all task cancellations + * cancellationCount due to heap is less than 30% of all task cancellations (Task-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByDecreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); - setupMockSearchbpStats(mockSearchbpStats, 10.0, 
10.0, 2.0, 2.0); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); ResourceFlowUnit flowUnit = testRca.operate(); From 5e3aed707109da92990b97b52bf1291dc35d0d65 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 30 Jun 2023 00:12:49 -0700 Subject: [PATCH 17/73] Add a new SearchBp Resource Unit (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../framework/api/summaries/ResourceUtil.java | 20 +++++++++++++++++++ .../SearchBackPressureRCA.java | 11 +++++++++- src/main/proto/inter_node_rpc_service.proto | 6 ++++++ .../SearchBackPressureRcaTest.java | 17 ++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java index 876cd4ca1..659f85548 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java @@ -135,6 +135,26 @@ public class ResourceUtil { .setResourceEnum(ResourceEnum.SHARD_REQUEST_CACHE) .setMetricEnum(MetricEnum.CACHE_MAX_SIZE) .build(); + /* + * searchbackpressure related resource + * SEARCHBACKPRESSURE_SHARD resource indicate a searchbackpressure unhealthy resource unit is caused by shard level cancellation + * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values + */ + public static final Resource SEARCHBACKPRESSURE_SHARD = + Resource.newBuilder() + .setResourceEnum(ResourceEnum.SEARCHBP) + .setMetricEnum(MetricEnum.SEARCHBP_SHARD) + .build(); + + /* + * SEARCHBACKPRESSURE_TASK resource indicate a searchbackpressure unhealthy resource unit is caused by task level cancellation + * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values + */ + public static final 
Resource SEARCHBACKPRESSURE_TASK = + Resource.newBuilder() + .setResourceEnum(ResourceEnum.SEARCHBP) + .setMetricEnum(MetricEnum.SEARCHBP_TASK) + .build(); /** * Read the resourceType name from the ResourceType object diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index aabbd1fc2..882c39d7a 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -6,6 +6,8 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure; import static org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil.readDataFromSqlResult; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_SHARD; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_TASK; import java.time.Clock; import java.util.ArrayList; @@ -27,6 +29,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; @@ -140,7 +143,6 @@ public ResourceFlowUnit operate() { counter += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); - ; // read key metrics into 
searchBackPressureRCAMetric for easier management SearchBackPressureRCAMetric searchBackPressureRCAMetric = getSearchBackPressureRCAMetric(); @@ -256,6 +258,10 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level + HotResourceSummary resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0); + nodeSummary.appendNestedSummary(resourceSummary); + return new ResourceFlowUnit<>( currentTimeMillis, context, @@ -272,6 +278,9 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: task-level + HotResourceSummary resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0); + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( currentTimeMillis, context, diff --git a/src/main/proto/inter_node_rpc_service.proto b/src/main/proto/inter_node_rpc_service.proto index fe5c864c9..89ea45d93 100644 --- a/src/main/proto/inter_node_rpc_service.proto +++ b/src/main/proto/inter_node_rpc_service.proto @@ -77,6 +77,9 @@ enum ResourceEnum { // Heap HEAP = 20 [(additional_fields).name = "heap"]; + // Search Back Pressure + SEARCHBP = 21 [(additional_fields).name = "search back pressure"]; + } enum MetricEnum { @@ -106,6 +109,9 @@ enum MetricEnum { OLD_GEN_USAGE_AFTER_FULL_GC = 31 [(additional_fields).name = "full gc", (additional_fields).description = "old gen usage after full gc in mb"]; // GC FULL_GC = 32 [(additional_fields).name = "full gc", (additional_fields).description = "full gc pause time in ms"]; + // Searchbp + SEARCHBP_SHARD = 33 [(additional_fields).name = "searchbackpressure shard", (additional_fields).description = "default value to indicate an unhealthy resource unit is from shard-level cancellation"]; + SEARCHBP_TASK = 34 [(additional_fields).name = "searchbackpressure task", (additional_fields).description = "default value 
to indicate an unhealthy resource unit is from task-level cancellation"]; } /* diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index de6014fb4..344dcb9db 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -206,6 +206,23 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { assertFalse(flowUnit.getResourceContext().isHealthy()); } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + } + private void setupMockHeapMetric(final Metric metric, final double val) { String valString = Double.toString(val); List data = From 8d78c3b9f2fb0f40bcdf33f5acacfbb650097d74 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 30 Jun 2023 09:51:33 -0700 Subject: [PATCH 18/73] Add UTs to test shard/task level resource include-ness (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRcaTest.java | 112 +++++++++++++++++- 1 file changed, 109 insertions(+), 3 
deletions(-) diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 344dcb9db..ffc80d876 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -9,6 +9,8 @@ import static org.junit.Assert.assertTrue; import static org.mockito.Mockito.when; import static org.mockito.MockitoAnnotations.initMocks; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_SHARD; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_TASK; import java.util.Arrays; import java.util.Collections; @@ -24,6 +26,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.metrics.MetricTestHelper; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; public class SearchBackPressureRcaTest { // Mock Metrics @@ -206,13 +209,75 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { assertFalse(flowUnit.getResourceContext().isHealthy()); } + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level in decrease threshold + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Shard-Level) + */ + @Test 
+ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); - /* + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_shard_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD); + + assertTrue(found_shard_resource); + } + + /* * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource - * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level in increase threshold + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Shard-Level) */ @Test - public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { + public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.5); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = 
flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_shard_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD); + + assertTrue(found_shard_resource); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in task-level + * decreasing threshold: + * node min heap usage in last 60 secs is more than 80% + * cancellationCount due to heap is less than 30% of all task cancellations (Task-Level) + */ + @Test + public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 2.0); @@ -221,6 +286,47 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { ResourceFlowUnit flowUnit = testRca.operate(); assertFalse(flowUnit.isEmpty()); assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_task_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK); + + assertTrue(found_task_resource); + } + + /* + * Test SearchBackPressure RCA returns unhealthy nonempty flow units with a HotResourceSummary of SEARCHBACKPRESSURE_SHARD Resource + * indicating the autotune (unhealthy resource unit) is caused by meeting the threshold in shard-level + * Increasing threshold: + * node max heap usage in last 60 secs is less than 70% + * cancellationCount due to heap is more than 50% of all task cancellations (Task-Level) + */ + 
@Test + public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { + setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.5); + setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); + IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); + + ResourceFlowUnit flowUnit = testRca.operate(); + assertFalse(flowUnit.isEmpty()); + assertFalse(flowUnit.getResourceContext().isHealthy()); + + HotNodeSummary hotNodeSummary = flowUnit.getSummary(); + List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); + boolean found_task_resource = + hotResourceSummaries.stream() + .anyMatch( + hotResourceSummary -> + hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK); + + assertTrue(found_task_resource); } private void setupMockHeapMetric(final Metric metric, final double val) { From 55b8ec0828c29f09e03d41ad174a1764c2d3c5ba Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 3 Jul 2023 15:03:51 -0700 Subject: [PATCH 19/73] Remove styling changes for Version.java (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../org/opensearch/performanceanalyzer/rca/Version.java | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java index 402013cf7..bfc85fcd3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/Version.java @@ -19,14 +19,11 @@ public final class Version { * transferred packets should be dropped. Every increment here should be accompanied with a line * describing the version bump. * - *

Note: The RCA version is agnostic of OpenSearch version. + * Note: The RCA version is agnostic of OpenSearch version. */ static final class Major { - // Bumping this post the Commons - // Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) - // and Service - // Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) - // change + // Bumping this post the Commons Lib(https://github.com/opensearch-project/performance-analyzer-commons/issues/2) + // and Service Metrics(https://github.com/opensearch-project/performance-analyzer-commons/issues/8) change static final int RCA_MAJ_VERSION = 1; } From 12fe8a8ac7bbcc8a66e9ee3af7b9bbc4f12d2cb8 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 5 Jul 2023 19:54:27 -0700 Subject: [PATCH 20/73] Add metadata to resourceSummary (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 22 +++++++++++--- .../SearchBackPressureRcaTest.java | 30 ++++++++++++------- 2 files changed, 38 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 882c39d7a..c8d095b9b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -258,8 +258,15 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level - HotResourceSummary resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0); + HotResourceSummary resourceSummary; + if (increaseThresholdMetByShard) { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 
0, "increase"); + } else { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "decrease"); + } + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( @@ -278,8 +285,15 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: task-level - HotResourceSummary resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0); + HotResourceSummary resourceSummary; + if (increaseThresholdMetByTask) { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "increase"); + } else { + resourceSummary = + new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "decrease"); + } + nodeSummary.appendNestedSummary(resourceSummary); return new ResourceFlowUnit<>( currentTimeMillis, diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index ffc80d876..355f757cb 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -44,6 +44,8 @@ public class SearchBackPressureRcaTest { private SearchBackPressureRCA testRca; private MetricTestHelper metricTestHelper; private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; + private static final String INCREASE_METADATA_STR = "increase"; + private static final String DECREASE_METADATA_STR = "decrease"; // mock heap metric columns private final List heapTableColumns = @@ -233,8 +235,10 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == 
SEARCHBACKPRESSURE_SHARD); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD) + && (hotResourceSummary.getMetaData() + == DECREASE_METADATA_STR)); assertTrue(found_shard_resource); } @@ -259,14 +263,16 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { HotNodeSummary hotNodeSummary = flowUnit.getSummary(); List hotResourceSummaries = hotNodeSummary.getHotResourceSummaryList(); - boolean found_shard_resource = + boolean found_shard_resource_and_increase_metadata = hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_SHARD); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_SHARD) + && (hotResourceSummary.getMetaData() + == INCREASE_METADATA_STR)); - assertTrue(found_shard_resource); + assertTrue(found_shard_resource_and_increase_metadata); } /* @@ -293,8 +299,10 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_TASK); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK) + && (hotResourceSummary.getMetaData() + == DECREASE_METADATA_STR)); assertTrue(found_task_resource); } @@ -323,8 +331,10 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { hotResourceSummaries.stream() .anyMatch( hotResourceSummary -> - hotResourceSummary.getResource() - == SEARCHBACKPRESSURE_TASK); + (hotResourceSummary.getResource() + == SEARCHBACKPRESSURE_TASK) + && (hotResourceSummary.getMetaData() + == INCREASE_METADATA_STR)); assertTrue(found_task_resource); } From 1b7837d9cd4480cdcdacce6a890ebd0185a15496 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 6 Jul 2023 11:24:00 -0700 Subject: [PATCH 21/73] Update to more general framework (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 4 + 
.../SearchBackPressureRCA.java | 90 ++++++++++++------- .../SearchBackPressureRcaTest.java | 15 ++-- 3 files changed, 70 insertions(+), 39 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 9a6512893..a3dca6031 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -11,6 +11,10 @@ public class SearchBackPressureRcaConfig { public static final String CONFIG_NAME = "search-back-pressure-rca-policy"; + /* Metadata fields for thresholds */ + public static final String INCREASE_THRESHOLD_BY_JVM_STR = "increase_jvm"; + public static final String DECREASE_THRESHOLD_BY_JVM_STR = "decrease_jvm"; + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index c8d095b9b..cae86ffe5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -96,22 +96,32 @@ public SearchBackPressureRCA( this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; + + // threshold for heap usage this.heapUsedIncreaseThreshold = SearchBackPressureRcaConfig.DEFAULT_MAX_HEAP_INCREASE_THRESHOLD; + this.heapUsedDecreaseThreshold = + SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; + + /* + * threshold for search back pressure service stats + * currently, only consider the percentage of 
JVM Usage cancellation count compared to the total cancellation count + * + */ this.heapShardCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; this.heapTaskCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_TASK_MAX_HEAP_CANCELLATION_THRESHOLD; - this.heapUsedDecreaseThreshold = - SearchBackPressureRcaConfig.DEFAULT_MIN_HEAP_DECREASE_THRESHOLD; + this.heapShardCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MIN_HEAP_CANCELLATION_THRESHOLD; this.heapTaskCancellationDecreaseMinThreashold = SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; - // initialize sliding window + // sliding window for heap usage this.heapUsageSlidingWindow = new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + // sliding window for JVM this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); this.taskJVMCancellationSlidingWindow = @@ -122,7 +132,7 @@ public SearchBackPressureRCA( /* * operate() is used for local build - * generateFlowUnitListFromWire simply use remote flowunits to + * generateFlowUnitListFromWire simply use remote flowunits to generate flow units locally */ @Override public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @@ -136,6 +146,11 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { setFlowUnits(flowUnitList); } + /* + * operate() evaluates the current stats against threshold + * generate Unhealthy Flow Unit if Searchbp Service needs autotune + * else, generate Healthy Flow Unit + */ @Override public ResourceFlowUnit operate() { LOG.info("SearchBackPressureRCA operate() intiatilized"); @@ -207,37 +222,27 @@ public ResourceFlowUnit operate() { new HotNodeSummary( instanceDetails.getInstanceId(), instanceDetails.getInstanceIp()); - // get the Configured Threshold and compare with Sliding Window Stats /* * 2 cases 
we send Unhealthy ResourceContext when we need to autotune the threshold - * - (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations - * - (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations + * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations + * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - // avgShardJVMCancellationPercentage = 80.0; // testing - - // TODO: add Task CancellationCountPercentage as another criteria - // TODO - /* - * HotResourceSummary resourceSummary = - new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, previousThreshold, 0); - nodeSummary.appendNestedSummary(resourceSummary); - - If you - */ - boolean increaseThresholdMetByShard = + // shard level thresholds + boolean increaseJVMThresholdMetByShard = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMetByShard = + boolean decreaseJVMThresholdMetByShard = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold); - boolean increaseThresholdMetByTask = + // task level thresholds + boolean increaseJVMThresholdMetByTask = (maxHeapUsagePercentage < heapUsedIncreaseThreshold) && (avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold); - boolean decreaseThresholdMetByTask = + boolean decreaseJVMThresholdMetByTask = (minHeapUsagePercentage > heapUsedDecreaseThreshold) && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); @@ -247,7 +252,7 @@ public 
ResourceFlowUnit operate() { // previousThreshold, 0); // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseThresholdMetByShard || decreaseThresholdMetByShard) { + if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", @@ -259,12 +264,22 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; - if (increaseThresholdMetByShard) { + if (increaseJVMThresholdMetByShard) { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "increase"); + new HotResourceSummary( + SEARCHBACKPRESSURE_SHARD, + 0, + 0, + 0, + SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR); } else { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_SHARD, 0, 0, 0, "decrease"); + new HotResourceSummary( + SEARCHBACKPRESSURE_SHARD, + 0, + 0, + 0, + SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR); } nodeSummary.appendNestedSummary(resourceSummary); @@ -274,7 +289,7 @@ public ResourceFlowUnit operate() { context, nodeSummary, !instanceDetails.getIsClusterManager()); - } else if (increaseThresholdMetByTask || decreaseThresholdMetByTask) { + } else if (increaseJVMThresholdMetByTask || decreaseJVMThresholdMetByTask) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", @@ -286,12 +301,22 @@ public ResourceFlowUnit operate() { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with 
metadata: task-level HotResourceSummary resourceSummary; - if (increaseThresholdMetByTask) { + if (increaseJVMThresholdMetByTask) { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "increase"); + new HotResourceSummary( + SEARCHBACKPRESSURE_TASK, + 0, + 0, + 0, + SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR); } else { resourceSummary = - new HotResourceSummary(SEARCHBACKPRESSURE_TASK, 0, 0, 0, "decrease"); + new HotResourceSummary( + SEARCHBACKPRESSURE_TASK, + 0, + 0, + 0, + SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR); } nodeSummary.appendNestedSummary(resourceSummary); @@ -390,11 +415,10 @@ private double getMetric(M metric, Field field, Strin */ @Override public void readRcaConf(RcaConf conf) { - // only initialized one time LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); - // read anything from config file in runtime - // if not just skip it + + // threshold value read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index 355f757cb..c4282e9ec 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -21,6 +21,7 @@ import org.mockito.Mock; import org.opensearch.performanceanalyzer.commons.metrics.AllMetrics; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; +import 
org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; @@ -44,8 +45,6 @@ public class SearchBackPressureRcaTest { private SearchBackPressureRCA testRca; private MetricTestHelper metricTestHelper; private static final double DEFAULT_MAX_HEAP_SIZE = 4294967296.0; - private static final String INCREASE_METADATA_STR = "increase"; - private static final String DECREASE_METADATA_STR = "decrease"; // mock heap metric columns private final List heapTableColumns = @@ -238,7 +237,8 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_SHARD) && (hotResourceSummary.getMetaData() - == DECREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .DECREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_shard_resource); } @@ -270,7 +270,8 @@ public void testSearchBpGetUnHealthyFlowUnitInShardLevelByIncreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_SHARD) && (hotResourceSummary.getMetaData() - == INCREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .INCREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_shard_resource_and_increase_metadata); } @@ -302,7 +303,8 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByDecreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_TASK) && (hotResourceSummary.getMetaData() - == DECREASE_METADATA_STR)); + == SearchBackPressureRcaConfig + .DECREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_task_resource); } @@ -334,7 +336,8 @@ public void testSearchBpGetUnHealthyFlowUnitInTaskLevelByIncreaseThreshold() { (hotResourceSummary.getResource() == SEARCHBACKPRESSURE_TASK) && (hotResourceSummary.getMetaData() - == INCREASE_METADATA_STR)); + == 
SearchBackPressureRcaConfig + .INCREASE_THRESHOLD_BY_JVM_STR)); assertTrue(found_task_resource); } From 8b059a8148e5d374bb2a140c93a67dda83113ddd Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 6 Jul 2023 13:42:56 -0700 Subject: [PATCH 22/73] (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../configs/SearchBackPressureRcaConfig.java | 5 +++++ .../SearchBackPressureRCA.java | 20 ++++--------------- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index a3dca6031..8838ce12f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -15,9 +15,14 @@ public class SearchBackPressureRcaConfig { public static final String INCREASE_THRESHOLD_BY_JVM_STR = "increase_jvm"; public static final String DECREASE_THRESHOLD_BY_JVM_STR = "decrease_jvm"; + public static final int SLIDING_WINDOW_SIZE_IN_MINS = 1; + // Interval period in seconds public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; + /* interval period to call operate() */ + public static final long EVAL_INTERVAL_IN_S = 5; + /* Increase Threshold */ // node max heap usage in last 60 secs is less than 70% public static final int DEFAULT_MAX_HEAP_INCREASE_THRESHOLD = 70; diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index cae86ffe5..73a53b74d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ 
b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -37,12 +37,11 @@ import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; public class SearchBackPressureRCA extends OldGenRca> { - // LOGGER for SearchBackPressureRCA private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); - private static final long EVAL_INTERVAL_IN_S = 5; + private static final long EVAL_INTERVAL_IN_S = SearchBackPressureRcaConfig.EVAL_INTERVAL_IN_S; - // Key Metrics to be used to determine health status + // Key metrics used to determine RCA Flow Unit health status private final Metric heapUsed; private final Metric searchbp_Stats; @@ -76,7 +75,7 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( /* * threshold for search back pressure service stats * currently, only consider the percentage of JVM Usage cancellation count compared to the total cancellation count - * */ this.heapShardCancellationIncreaseMaxThreshold = SearchBackPressureRcaConfig.DEFAULT_SHARD_MAX_HEAP_CANCELLATION_THRESHOLD; @@ -153,8 +151,6 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { */ @Override public ResourceFlowUnit operate() { - LOG.info("SearchBackPressureRCA operate() intiatilized"); - counter += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); @@ -180,9 +176,6 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); } - // for testing - // heapUsageSlidingWindow.next(new SlidingWindowData(currentTimeMillis, 65.3)); - double shardJVMCancellationPercentage = searchBackPressureRCAMetric.getShardJVMCancellationPercent(); if (!Double.isNaN(shardJVMCancellationPercentage)) { @@ -247,11 +240,6 @@ public ResourceFlowUnit operate() { && (avgTaskJVMCancellationPercentage 
< heapTaskCancellationDecreaseMinThreashold); - // HotResourceSummary resourceSummary = - // new HotResourceSummary(HEAP_MAX_SIZE, currentThreshold, - // previousThreshold, 0); - // nodeSummary.appendNestedSummary(resourceSummary); - if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { // Generate a flow unit with an Unhealthy ResourceContext LOG.info( @@ -418,7 +406,7 @@ public void readRcaConf(RcaConf conf) { LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); - // threshold value read from config file + // threshold read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); From c49e771553a5a869fdfd848263dad76aa8ce1b74 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 14:08:58 -0700 Subject: [PATCH 23/73] Refactor the MinMaxSlidingWindow and bug fix (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../framework/api/summaries/ResourceUtil.java | 2 - .../rca/store/rca/OldGenRca.java | 59 ++++++++---- .../SearchBackPressureRCA.java | 96 +++++++++++++------ .../model/SearchBackPressureRCAMetric.java | 3 - 4 files changed, 108 insertions(+), 52 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java index 659f85548..03db4299b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/framework/api/summaries/ResourceUtil.java @@ -138,7 +138,6 @@ public class ResourceUtil { /* * searchbackpressure related resource * SEARCHBACKPRESSURE_SHARD resource indicate a searchbackpressure unhealthy resource 
unit is caused by shard level cancellation - * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values */ public static final Resource SEARCHBACKPRESSURE_SHARD = Resource.newBuilder() @@ -148,7 +147,6 @@ public class ResourceUtil { /* * SEARCHBACKPRESSURE_TASK resource indicate a searchbackpressure unhealthy resource unit is caused by task level cancellation - * ResourceEnum.OLD_GEN and MetricEnum.UNRECOGNIZED are dummy values */ public static final Resource SEARCHBACKPRESSURE_TASK = Resource.newBuilder() diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index a53c2b7bf..147dd5db1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -252,32 +252,57 @@ public double readMin() { } /** - * Sliding window to check the max/min olg gen usage within a given time frame Previous - * MinGoldGenSlidingWindow should be deprecated since it modify the sliding window size in - * next() + * Sliding window to check the max/min olg gen usage within a given time frame + * + * @param isMinSlidingWindow true if the sliding window is for min usage, false for max usage + * Provides a more general framework than MinOldGenSlidingWindow as this sliding window can + * be implemented as minSlidingWindow or maxSlidingWindow depending on the need. 
*/ - public static class MinMaxOldGenSlidingWindow extends SlidingWindow { + public static class MinMaxSlidingWindow extends SlidingWindow { + boolean isMinSlidingWindow; - public MinMaxOldGenSlidingWindow(int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit) { + public MinMaxSlidingWindow( + int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, + TimeUnit timeUnit, + boolean isMinSlidingWindow) { super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); + this.isMinSlidingWindow = isMinSlidingWindow; } - public double readMax() { - if (!windowDeque.isEmpty()) { - return windowDeque.stream() - .mapToDouble(SlidingWindowData::getValue) - .max() - .orElse(Double.NaN); + @Override + public void next(SlidingWindowData e) { + boolean pollFirstCondition; + if (isMinSlidingWindow) { + // monotonically decreasing sliding window + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() >= e.getValue()) { + windowDeque.pollFirst(); + } + } else { + // monotonically increasing sliding window + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() < e.getValue()) { + windowDeque.pollFirst(); + } + } + + windowDeque.addFirst(e); + while (!windowDeque.isEmpty() + && TimeUnit.MILLISECONDS.toSeconds( + e.getTimeStamp() - windowDeque.peekLast().getTimeStamp()) + > SLIDING_WINDOW_SIZE) { + windowDeque.pollLast(); } - return Double.NaN; } - public double readMin() { + /* + * read last element in the window + * if the sliding window is MinSlidingWindow then returns the min element + * else return the max element in the deque + */ + public double readLastElementInWindow() { if (!windowDeque.isEmpty()) { - return windowDeque.stream() - .mapToDouble(SlidingWindowData::getValue) - .min() - .orElse(Double.NaN); + return windowDeque.peekLast().getValue(); } return Double.NaN; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java 
b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 73a53b74d..35b1261b1 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -72,15 +72,18 @@ public class SearchBackPressureRCA extends OldGenRca taskJVMCancellationSlidingWindow; private final SlidingWindow shardJVMCancellationSlidingWindow; - private final MinMaxOldGenSlidingWindow heapUsageSlidingWindow; + private final MinMaxSlidingWindow minHeapUsageSlidingWindow; + private final MinMaxSlidingWindow maxHeapUsageSlidingWindow; // Sliding Window Interval - private static final int SLIDING_WINDOW_SIZE_IN_MINS = SearchBackPressureRcaConfig.SLIDING_WINDOW_SIZE_IN_MINS; + private static final int SLIDING_WINDOW_SIZE_IN_MINS = + SearchBackPressureRcaConfig.SLIDING_WINDOW_SIZE_IN_MINS; private static final int SLIDING_WINDOW_SIZE_IN_SECS = SLIDING_WINDOW_SIZE_IN_MINS * 60; - // counter to check the samples has been taken, only emit flow units when counter equals to + // currentIterationNumber to check the samples has been taken, only emit flow units when + // currentIterationNumber equals to // rcaPeriod - private long counter; + private long currentIterationNumber; // Required amount of RCA period this RCA needs to run before sending out a flowunit private final int rcaPeriod; @@ -90,6 +93,7 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { + // metric gcType is needed to construct OldGenRca Class (Parent Class) super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; this.rcaPeriod = rcaPeriod; @@ -117,8 +121,11 @@ public SearchBackPressureRCA( SearchBackPressureRcaConfig.DEFAULT_TASK_MIN_HEAP_CANCELLATION_THRESHOLD; // sliding 
window for heap usage - this.heapUsageSlidingWindow = - new MinMaxOldGenSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); + this.minHeapUsageSlidingWindow = + new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, true); + this.maxHeapUsageSlidingWindow = + new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, false); + // sliding window for JVM this.shardJVMCancellationSlidingWindow = new SlidingWindow<>(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES); @@ -130,7 +137,9 @@ public SearchBackPressureRCA( /* * operate() is used for local build - * generateFlowUnitListFromWire simply use remote flowunits to generate flow units locally + * This will compute the flow units from other hosts in the cluster + * for a given Metric and try to send the subscription requests + * to stale or new hosts in cluster if need be */ @Override public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { @@ -146,12 +155,12 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { /* * operate() evaluates the current stats against threshold - * generate Unhealthy Flow Unit if Searchbp Service needs autotune - * else, generate Healthy Flow Unit + * Unhealthy Flow Units is a marker that this resource at current instance is not healthy + * Autotune decision would be made by downstream classes */ @Override public ResourceFlowUnit operate() { - counter += 1; + currentIterationNumber += 1; ResourceContext context = null; long currentTimeMillis = System.currentTimeMillis(); @@ -172,7 +181,9 @@ public ResourceFlowUnit operate() { // update sliding window if the value is NOT NaN double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { - heapUsageSlidingWindow.next( + minHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + maxHeapUsageSlidingWindow.next( new SlidingWindowData(currentTimeMillis, 
prevheapUsagePercentage)); } @@ -190,17 +201,19 @@ public ResourceFlowUnit operate() { new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); } - LOG.info("SearchBackPressureRCA counter is {}", counter); - // if counter matches the rca period, emit the flow unit - if (counter == this.rcaPeriod) { - LOG.info("SearchBackPressureRCA counter in rcaPeriod is {}", counter); + LOG.info("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); + // if currentIterationNumber matches the rca period, emit the flow unit + if (currentIterationNumber == this.rcaPeriod) { + LOG.info( + "SearchBackPressureRCA currentIterationNumber in rcaPeriod is {}", + currentIterationNumber); currentTimeMillis = System.currentTimeMillis(); - // reset counter - counter = 0; + // reset currentIterationNumber + currentIterationNumber = 0; - double maxHeapUsagePercentage = heapUsageSlidingWindow.readMax(); - double minHeapUsagePercentage = heapUsageSlidingWindow.readMin(); + double maxHeapUsagePercentage = maxHeapUsageSlidingWindow.readLastElementInWindow(); + double minHeapUsagePercentage = minHeapUsageSlidingWindow.readLastElementInWindow(); double avgShardJVMCancellationPercentage = shardJVMCancellationSlidingWindow.readAvg(); double avgTaskJVMCancellationPercentage = taskJVMCancellationSlidingWindow.readAvg(); @@ -240,15 +253,15 @@ public ResourceFlowUnit operate() { && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); - if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { - // Generate a flow unit with an Unhealthy ResourceContext - LOG.info( - "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", - maxHeapUsagePercentage, - heapUsedIncreaseThreshold, - avgShardJVMCancellationPercentage, - heapShardCancellationIncreaseMaxThreshold); + // Generate a flow unit 
with an Unhealthy ResourceContext + LOG.info( + "Increase/Decrease Condition Meet for Shard, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", + maxHeapUsagePercentage, + heapUsedIncreaseThreshold, + avgShardJVMCancellationPercentage, + heapShardCancellationIncreaseMaxThreshold); + if (increaseJVMThresholdMetByShard || decreaseJVMThresholdMetByShard) { context = new ResourceContext(Resources.State.UNHEALTHY); // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; @@ -324,7 +337,7 @@ public ResourceFlowUnit operate() { } } else { - // return healthy state when the counter does not meet rcaPeriod + // return healthy state when the currentIterationNumber does not meet rcaPeriod LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); currentTimeMillis = System.currentTimeMillis(); return new ResourceFlowUnit<>(currentTimeMillis); @@ -383,16 +396,36 @@ private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { } private double getMetric(M metric, Field field, String fieldName) { - double response = 0; + if (metric == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take " + + metric.name() + + " as a metric. 
Please check the analysis graph!"); + } + + double response = 0.0; + // LOG.info( + // " metric.getFlowUnits() length is: {}, and metric name is {}", + // metric.getFlowUnits().size(), + // metric.name()); for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { if (!flowUnit.isEmpty()) { + LOG.info( + "flowUnit.getData() rows size is {}", + flowUnit.getData().getValues("SearchBackPressureStats").size()); double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - if (!Double.isNaN(metricResponse) && metricResponse > 0) { + // print out the metricResponse + LOG.info("Searchbp metricResponse is: {}", metricResponse); + if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } } } + LOG.info("Searchbp response is: {}", response); return response; } @@ -408,6 +441,9 @@ public void readRcaConf(RcaConf conf) { // threshold read from config file this.heapUsedIncreaseThreshold = config.getMaxHeapIncreasePercentageThreshold(); + LOG.info( + "SearchBackPressureRCA heapUsedIncreaseThreshold is set to {}", + this.heapUsedIncreaseThreshold); this.heapShardCancellationIncreaseMaxThreshold = config.getMaxShardHeapCancellationPercentageThreshold(); this.heapTaskCancellationIncreaseMaxThreshold = diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java index 718c76b8f..ef74e1763 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/model/SearchBackPressureRCAMetric.java @@ -5,7 +5,6 @@ package org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model; -/** Represents used heap and max heap in gigabytes */ public class 
SearchBackPressureRCAMetric { private final double usedHeap; private final double maxHeap; @@ -14,7 +13,6 @@ public class SearchBackPressureRCAMetric { private final double searchbpJVMShardCancellationCount; private final double searchbpJVMTaskCancellationCount; - // Constructor public SearchBackPressureRCAMetric( double usedHeap, double maxHeap, @@ -30,7 +28,6 @@ public SearchBackPressureRCAMetric( this.searchbpJVMTaskCancellationCount = searchbpJVMTaskCancellationCount; } - // Getters public double getUsedHeap() { return usedHeap; } From 648e94d57240be143e063b21308032ee579f9b8a Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 14:38:54 -0700 Subject: [PATCH 24/73] Refactor Heap Stats Metrics Getter(Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 68 ++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 35b1261b1..40e35fa5f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -28,6 +28,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.contexts.ResourceContext; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.MetricFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.persist.SQLParsingUtil; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import 
org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; @@ -40,11 +41,16 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( // metric gcType is needed to construct OldGenRca Class (Parent Class) super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); this.heapUsed = heapUsed; + this.heapMax = heapMax; this.rcaPeriod = rcaPeriod; this.clock = Clock.systemUTC(); this.searchbp_Stats = searchbp_Stats; @@ -344,10 +351,67 @@ public ResourceFlowUnit operate() { } } + /** + * Get the Heap Related Stats (Heap Used and Heap Size in gigabytes) + * + * @param isHeapUsed is true meaning get the value of used heap in gigabytes otherwise, meaning + * get the value of max heap in gigabytes + */ + public double getHeapStats(boolean isHeapUsed) { + double heapStats = DEFAULT_HEAP_VAL; + List heapStatsMetrics; + if (isHeapUsed == true) { + if (heap_Used == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take heap_Used as a metric. Please check the analysis graph!"); + } + + heapStatsMetrics = heap_Used.getFlowUnits(); + } else { + if (heap_Max == null) { + throw new IllegalStateException( + "RCA: " + + this.name() + + "was not configured in the graph to " + + "take heap_Max as a metric. 
Please check the analysis graph!"); + } + + heapStatsMetrics = heap_Max.getFlowUnits(); + } + + for (MetricFlowUnit heapStatsMetric : heapStatsMetrics) { + if (heapStatsMetric.isEmpty()) { + continue; + } + + double ret = + SQLParsingUtil.readDataFromSqlResult( + heapStatsMetric.getData(), + AllMetrics.HeapDimension.MEM_TYPE.getField(), + AllMetrics.GCType.HEAP.toString(), + MetricsDB.MAX); + if (Double.isNaN(ret)) { + LOG.error( + "Failed to parse metric in FlowUnit from {}", + heap_Used.getClass().getName()); + } else { + heapStats = ret / CONVERT_BYTES_TO_MEGABYTES; + } + } + + return heapStats; + } + private SearchBackPressureRCAMetric getSearchBackPressureRCAMetric() { // Get Heap Usage related metrics - double prevHeapUsage = getOldGenUsedOrDefault(0d); - double maxHeapSize = getMaxOldGenSizeOrDefault(Double.MAX_VALUE); + double prevHeapUsage = getHeapStats(true); + double maxHeapSize = getHeapStats(false); + + // Log prevHeapUsage and maxHeapSize + LOG.info("prevHeapUsage: {}, maxHeapSize: {}", prevHeapUsage, maxHeapSize); // Get SearchBack Pressure related metrics from stats type field Field searchbp_stats_type_field = From ca6505937737339a8d84e3da95d720dab64c4aea Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 15:04:41 -0700 Subject: [PATCH 25/73] Refactor HeapUsed and HeapMax Getters (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/OpenSearchAnalysisGraph.java | 5 ++- .../SearchBackPressureRCA.java | 31 +++++++------------ .../SearchBackPressureRcaTest.java | 25 ++------------- 3 files changed, 17 insertions(+), 44 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 967eded24..970db2ace 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ 
b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -445,12 +445,11 @@ public void construct() { // Search Back Pressure Service RCA enabled SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, gcType, searchbp_Stats); + new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, searchbp_Stats); searchBackPressureRCA.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - searchBackPressureRCA.addAllUpstreams( - Arrays.asList(heapMax, heapUsed, gcType, searchbp_Stats)); + searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, searchbp_Stats)); // Search Back Pressure Service Cluster RCA enabled SearchBackPressureClusterRCA searchBackPressureClusterRCA = diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 40e35fa5f..a81dcd936 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -22,6 +22,7 @@ import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.Metric; +import org.opensearch.performanceanalyzer.rca.framework.api.Rca; import org.opensearch.performanceanalyzer.rca.framework.api.Resources; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindow; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; @@ -34,10 +35,10 @@ import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import 
org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails; import org.opensearch.performanceanalyzer.rca.scheduler.FlowUnitOperationArgWrapper; -import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca; +import org.opensearch.performanceanalyzer.rca.store.rca.OldGenRca.MinMaxSlidingWindow; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.model.SearchBackPressureRCAMetric; -public class SearchBackPressureRCA extends OldGenRca> { +public class SearchBackPressureRCA extends Rca> { private static final Logger LOG = LogManager.getLogger(SearchBackPressureRCA.class); private static final double BYTES_TO_GIGABYTES = Math.pow(1024, 3); private static final long EVAL_INTERVAL_IN_S = SearchBackPressureRcaConfig.EVAL_INTERVAL_IN_S; @@ -98,9 +99,8 @@ public class SearchBackPressureRCA extends OldGenRca SearchBackPressureRCA( - final int rcaPeriod, final M heapMax, final M heapUsed, M gcType, M searchbp_Stats) { - // metric gcType is needed to construct OldGenRca Class (Parent Class) - super(EVAL_INTERVAL_IN_S, heapUsed, heapMax, null, gcType); + final int rcaPeriod, final M heapMax, final M heapUsed, M searchbp_Stats) { + super(EVAL_INTERVAL_IN_S); this.heapUsed = heapUsed; this.heapMax = heapMax; this.rcaPeriod = rcaPeriod; @@ -361,7 +361,7 @@ public double getHeapStats(boolean isHeapUsed) { double heapStats = DEFAULT_HEAP_VAL; List heapStatsMetrics; if (isHeapUsed == true) { - if (heap_Used == null) { + if (heapUsed == null) { throw new IllegalStateException( "RCA: " + this.name() @@ -369,9 +369,9 @@ public double getHeapStats(boolean isHeapUsed) { + "take heap_Used as a metric. Please check the analysis graph!"); } - heapStatsMetrics = heap_Used.getFlowUnits(); + heapStatsMetrics = heapUsed.getFlowUnits(); } else { - if (heap_Max == null) { + if (heapMax == null) { throw new IllegalStateException( "RCA: " + this.name() @@ -379,7 +379,7 @@ public double getHeapStats(boolean isHeapUsed) { + "take heap_Max as a metric. 
Please check the analysis graph!"); } - heapStatsMetrics = heap_Max.getFlowUnits(); + heapStatsMetrics = heapMax.getFlowUnits(); } for (MetricFlowUnit heapStatsMetric : heapStatsMetrics) { @@ -396,7 +396,7 @@ public double getHeapStats(boolean isHeapUsed) { if (Double.isNaN(ret)) { LOG.error( "Failed to parse metric in FlowUnit from {}", - heap_Used.getClass().getName()); + heapUsed.getClass().getName()); } else { heapStats = ret / CONVERT_BYTES_TO_MEGABYTES; } @@ -471,19 +471,12 @@ private double getMetric(M metric, Field field, Strin } double response = 0.0; - // LOG.info( - // " metric.getFlowUnits() length is: {}, and metric name is {}", - // metric.getFlowUnits().size(), - // metric.name()); for (MetricFlowUnit flowUnit : metric.getFlowUnits()) { if (!flowUnit.isEmpty()) { - LOG.info( - "flowUnit.getData() rows size is {}", - flowUnit.getData().getValues("SearchBackPressureStats").size()); double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - // print out the metricResponse - LOG.info("Searchbp metricResponse is: {}", metricResponse); + // // print out the metricResponse + // LOG.info("Searchbp metricResponse is: {}", metricResponse); if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } diff --git a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java index c4282e9ec..f371064e9 100644 --- a/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java +++ b/src/test/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRcaTest.java @@ -77,16 +77,12 @@ public void setup() throws Exception { this.metricTestHelper = new MetricTestHelper(RCA_PERIOD); setupMockHeapMetric(mockHeapUsed, 80.0); 
setupMockHeapMetric(mockHeapMax, 100.0); - // gcType is required for constructor of SearchBackPressureRCA but the exact type of gcType - // does not matter - setupMockGcType(CMS_COLLECTOR); // set up SearchBp_Stats table setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 8.0, 7.0); this.testRca = - new SearchBackPressureRCA( - RCA_PERIOD, mockHeapMax, mockHeapUsed, mockGcType, mockSearchbpStats); + new SearchBackPressureRCA(RCA_PERIOD, mockHeapMax, mockHeapUsed, mockSearchbpStats); } /* @@ -220,7 +216,7 @@ public void testSearchBpGetUnHealthyFlowUnitByTaskDecreaseThreshold() { @Test public void testSearchBpGetUnHealthyFlowUnitInShardLevelByDecreaseThreshold() { setupMockHeapMetric(mockHeapMax, DEFAULT_MAX_HEAP_SIZE); - setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.9); + setupMockHeapMetric(mockHeapUsed, DEFAULT_MAX_HEAP_SIZE * 0.95); setupMockSearchbpStats(mockSearchbpStats, 10.0, 10.0, 2.0, 8.0); IntStream.range(0, RCA_PERIOD - 1).forEach(i -> testRca.operate()); @@ -346,7 +342,7 @@ private void setupMockHeapMetric(final Metric metric, final double val) { String valString = Double.toString(val); List data = Arrays.asList( - AllMetrics.GCType.OLD_GEN.toString(), + AllMetrics.GCType.HEAP.toString(), valString, valString, valString, @@ -360,21 +356,6 @@ private void setupMockHeapMetric(final Metric metric, final double val) { heapTableColumns, data)))); } - private void setupMockGcType(final String collector) { - List gcInfoTableColumns = - Arrays.asList( - AllMetrics.GCInfoDimension.MEMORY_POOL.toString(), - AllMetrics.GCInfoDimension.COLLECTOR_NAME.toString()); - List data = Arrays.asList(AllMetrics.GCType.OLD_GEN.toString(), collector); - when(mockGcType.getFlowUnits()) - .thenReturn( - Collections.singletonList( - new MetricFlowUnit( - 0, - metricTestHelper.createTestResult( - gcInfoTableColumns, data)))); - } - private void setupMockSearchbpStats( final Metric metric, final double searchbpShardCancellationCount, From 
4c69fb3a9c8b767f0028d823348998e9fb62a62a Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 15:24:57 -0700 Subject: [PATCH 26/73] Refactor operate() (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressureRCA.java | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index a81dcd936..4050413c7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -186,6 +186,7 @@ public ResourceFlowUnit operate() { searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); // update sliding window if the value is NOT NaN + // TO DO double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); if (!Double.isNaN(prevheapUsagePercentage)) { minHeapUsageSlidingWindow.next( @@ -240,23 +241,26 @@ public ResourceFlowUnit operate() { * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ + boolean maxHeapBelowIncreaseThreshold = maxHeapUsagePercentage < heapUsedIncreaseThreshold; + boolean minHeapAboveDecreaseThreshold = minHeapUsagePercentage > heapUsedDecreaseThreshold; + // shard level thresholds boolean increaseJVMThresholdMetByShard = - (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + maxHeapBelowIncreaseThreshold && (avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold); 
boolean decreaseJVMThresholdMetByShard = - (minHeapUsagePercentage > heapUsedDecreaseThreshold) + minHeapAboveDecreaseThreshold && (avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold); // task level thresholds boolean increaseJVMThresholdMetByTask = - (maxHeapUsagePercentage < heapUsedIncreaseThreshold) + maxHeapBelowIncreaseThreshold && (avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold); boolean decreaseJVMThresholdMetByTask = - (minHeapUsagePercentage > heapUsedDecreaseThreshold) + minHeapAboveDecreaseThreshold && (avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold); From cf92a617bbce21596f5a07398599df27e03aa180 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 7 Jul 2023 16:00:04 -0700 Subject: [PATCH 27/73] Refactor operate() and remove dead comments (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureRCA.java | 99 ++++++++++--------- 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index 4050413c7..e8a50d064 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -153,7 +153,7 @@ public void generateFlowUnitListFromWire(FlowUnitOperationArgWrapper args) { final List flowUnitMessages = args.getWireHopper().readFromWire(args.getNode()); final List> flowUnitList = new ArrayList<>(); - LOG.debug("rca: Executing fromWire: {}", this.getClass().getSimpleName()); + LOG.info("rca: Executing fromWire: {}", this.getClass().getSimpleName()); for (FlowUnitMessage flowUnitMessage : flowUnitMessages) { 
flowUnitList.add(ResourceFlowUnit.buildFlowUnitFromWrapper(flowUnitMessage)); } @@ -185,29 +185,7 @@ public ResourceFlowUnit operate() { searchBackPressureRCAMetric.getSearchbpJVMShardCancellationCount(), searchBackPressureRCAMetric.getSearchbpJVMTaskCancellationCount()); - // update sliding window if the value is NOT NaN - // TO DO - double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); - if (!Double.isNaN(prevheapUsagePercentage)) { - minHeapUsageSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); - maxHeapUsageSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); - } - - double shardJVMCancellationPercentage = - searchBackPressureRCAMetric.getShardJVMCancellationPercent(); - if (!Double.isNaN(shardJVMCancellationPercentage)) { - shardJVMCancellationSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); - } - - double taskJVMCancellationPercentage = - searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); - if (!Double.isNaN(taskJVMCancellationPercentage)) { - taskJVMCancellationSlidingWindow.next( - new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); - } + updateAllSlidingWindows(searchBackPressureRCAMetric, currentTimeMillis); LOG.info("SearchBackPressureRCA currentIterationNumber is {}", currentIterationNumber); // if currentIterationNumber matches the rca period, emit the flow unit @@ -241,28 +219,30 @@ public ResourceFlowUnit operate() { * (increase) node max heap usage in last 60 secs is less than 70% and cancellationCountPercentage due to heap is more than 50% of all task cancellations * (decrease) node min heap usage in last 60 secs is more than 80% and cancellationCountPercetange due to heap is less than 30% of all task cancellations */ - boolean maxHeapBelowIncreaseThreshold = maxHeapUsagePercentage < heapUsedIncreaseThreshold; - boolean minHeapAboveDecreaseThreshold = 
minHeapUsagePercentage > heapUsedDecreaseThreshold; - + boolean maxHeapBelowIncreaseThreshold = + maxHeapUsagePercentage < heapUsedIncreaseThreshold; + boolean minHeapAboveDecreaseThreshold = + minHeapUsagePercentage > heapUsedDecreaseThreshold; + boolean shardHeapCancellationPercentageAboveThreshold = + avgShardJVMCancellationPercentage > heapShardCancellationIncreaseMaxThreshold; + boolean shardHeapCancellationPercentageBelowThreshold = + avgShardJVMCancellationPercentage < heapShardCancellationDecreaseMinThreashold; + boolean taskHeapCancellationPercentageAboveThreshold = + avgTaskJVMCancellationPercentage > heapTaskCancellationIncreaseMaxThreshold; + boolean taskHeapCancellationPercentageBelowThreshold = + avgTaskJVMCancellationPercentage < heapTaskCancellationDecreaseMinThreashold; + // shard level thresholds boolean increaseJVMThresholdMetByShard = - maxHeapBelowIncreaseThreshold - && (avgShardJVMCancellationPercentage - > heapShardCancellationIncreaseMaxThreshold); + maxHeapBelowIncreaseThreshold && shardHeapCancellationPercentageAboveThreshold; boolean decreaseJVMThresholdMetByShard = - minHeapAboveDecreaseThreshold - && (avgShardJVMCancellationPercentage - < heapShardCancellationDecreaseMinThreashold); + minHeapAboveDecreaseThreshold && shardHeapCancellationPercentageBelowThreshold; // task level thresholds boolean increaseJVMThresholdMetByTask = - maxHeapBelowIncreaseThreshold - && (avgTaskJVMCancellationPercentage - > heapTaskCancellationIncreaseMaxThreshold); + maxHeapBelowIncreaseThreshold && taskHeapCancellationPercentageAboveThreshold; boolean decreaseJVMThresholdMetByTask = - minHeapAboveDecreaseThreshold - && (avgTaskJVMCancellationPercentage - < heapTaskCancellationDecreaseMinThreashold); + minHeapAboveDecreaseThreshold && taskHeapCancellationPercentageBelowThreshold; // Generate a flow unit with an Unhealthy ResourceContext LOG.info( @@ -274,8 +254,8 @@ public ResourceFlowUnit operate() { if (increaseJVMThresholdMetByShard || 
decreaseJVMThresholdMetByShard) { context = new ResourceContext(Resources.State.UNHEALTHY); - // add an additional resource with metadata: shard-level HotResourceSummary resourceSummary; + // metadata fields indicate the reason for Unhealthy Resource Unit if (increaseJVMThresholdMetByShard) { resourceSummary = new HotResourceSummary( @@ -295,14 +275,12 @@ public ResourceFlowUnit operate() { } nodeSummary.appendNestedSummary(resourceSummary); - return new ResourceFlowUnit<>( currentTimeMillis, context, nodeSummary, !instanceDetails.getIsClusterManager()); } else if (increaseJVMThresholdMetByTask || decreaseJVMThresholdMetByTask) { - // Generate a flow unit with an Unhealthy ResourceContext LOG.info( "Increase/Decrease Condition Meet for Task, maxHeapUsagePercentage: {} is less than threshold: {}, avgShardJVMCancellationPercentage: {} is bigger than heapShardCancellationIncreaseMaxThreshold: {}", maxHeapUsagePercentage, @@ -311,7 +289,6 @@ public ResourceFlowUnit operate() { heapTaskCancellationIncreaseMaxThreshold); context = new ResourceContext(Resources.State.UNHEALTHY); - // add an additional resource with metadata: task-level HotResourceSummary resourceSummary; if (increaseJVMThresholdMetByTask) { resourceSummary = @@ -348,8 +325,8 @@ public ResourceFlowUnit operate() { } } else { - // return healthy state when the currentIterationNumber does not meet rcaPeriod - LOG.info("Empty Healthy FlowUnit returned for SearchbackPressureRCA"); + // Return Empty ResourceFlowUnit if none of the thresholds is met + LOG.info("Empty FlowUnit returned for SearchbackPressureRCA"); currentTimeMillis = System.currentTimeMillis(); return new ResourceFlowUnit<>(currentTimeMillis); } @@ -479,8 +456,7 @@ private double getMetric(M metric, Field field, Strin if (!flowUnit.isEmpty()) { double metricResponse = readDataFromSqlResult(flowUnit.getData(), field, fieldName, MetricsDB.MAX); - // // print out the metricResponse - // LOG.info("Searchbp metricResponse is: {}", metricResponse); + 
LOG.info("Searchbp metricResponse is: {}", metricResponse); if (!Double.isNaN(metricResponse) && metricResponse >= 0.0) { response = metricResponse; } @@ -497,7 +473,6 @@ private double getMetric(M metric, Field field, Strin */ @Override public void readRcaConf(RcaConf conf) { - LOG.info("SearchBackPressureRCA readRcaConf() intiatilized"); final SearchBackPressureRcaConfig config = conf.getSearchBackPressureRcaConfig(); // threshold read from config file @@ -515,4 +490,32 @@ public void readRcaConf(RcaConf conf) { this.heapTaskCancellationDecreaseMinThreashold = config.getMinTaskHeapCancellationPercentageThreshold(); } + + /* + * Update Stats for all Sliding Windows + */ + private void updateAllSlidingWindows( + SearchBackPressureRCAMetric searchBackPressureRCAMetric, long currentTimeMillis) { + double prevheapUsagePercentage = searchBackPressureRCAMetric.getHeapUsagePercent(); + if (!Double.isNaN(prevheapUsagePercentage)) { + minHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + maxHeapUsageSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, prevheapUsagePercentage)); + } + + double shardJVMCancellationPercentage = + searchBackPressureRCAMetric.getShardJVMCancellationPercent(); + if (!Double.isNaN(shardJVMCancellationPercentage)) { + shardJVMCancellationSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, shardJVMCancellationPercentage)); + } + + double taskJVMCancellationPercentage = + searchBackPressureRCAMetric.getTaskJVMCancellationPercent(); + if (!Double.isNaN(taskJVMCancellationPercentage)) { + taskJVMCancellationSlidingWindow.next( + new SlidingWindowData(currentTimeMillis, taskJVMCancellationPercentage)); + } + } } From 8996edde266f4f5ace4c194ee09907e9c20bcbd7 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 10:21:41 -0700 Subject: [PATCH 28/73] Add new ActionPojo for Searchbp (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- 
.../actions/SearchBackPressureAction.java | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java new file mode 100644 index 000000000..544820728 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -0,0 +1,38 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.actions; + +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import javax.annotation.Nonnull; +import org.opensearch.performanceanalyzer.AppContext; +import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails.Id; +import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails.Ip; +import org.opensearch.performanceanalyzer.rca.store.rca.cluster.NodeKey; +import org.apache.logging.log4j.LogManager; + +public class SearchBackPressureAction extends SuppressibleAction { + private static final Logger LOG = LogManager.getLogger(SearchBackPressureAction.class); + public static final String NAME = "SearchBackPressureAction"; + private static final String ID_KEY = "Id"; + private static final String IP_KEY = "Ip"; + private final NodeKey node; + + /* TO DO: Discuss the default cool off period for SearchBackPressureAction + * Time to wait since last recommendation, before suggesting this action again + */ + private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.DAYS.toMillis(3); + + // step size in 
percent + + +} + From 2aab197d902ad53911fc74dcf651787582640119 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 11:43:45 -0700 Subject: [PATCH 29/73] Add new ActionPojo for Searchbp#2 (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 174 ++++++++++++++++-- 1 file changed, 163 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 544820728..60b78f606 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -5,34 +5,186 @@ package org.opensearch.performanceanalyzer.decisionmaker.actions; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.annotations.SerializedName; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; -import javax.annotation.Nonnull; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; -import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails.Id; -import org.opensearch.performanceanalyzer.rca.framework.util.InstanceDetails.Ip; import org.opensearch.performanceanalyzer.rca.store.rca.cluster.NodeKey; -import org.apache.logging.log4j.LogManager; public class SearchBackPressureAction extends SuppressibleAction { private static final Logger LOG = LogManager.getLogger(SearchBackPressureAction.class); public static final String NAME = "SearchBackPressureAction"; private static final String ID_KEY = 
"Id"; private static final String IP_KEY = "Ip"; - private final NodeKey node; - /* TO DO: Discuss the default cool off period for SearchBackPressureAction - * Time to wait since last recommendation, before suggesting this action again + /* placeholder for dummy impactVector + * TODO: Remove + */ + private static final ImpactVector NO_IMPACT = new ImpactVector(); + + /* TO DO: Discuss the default cool off period for SearchBackPressureAction + * Time to wait since last recommendation, before suggesting this action again */ private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.DAYS.toMillis(3); - // step size in percent + // step size in percent + /* From Config Per Diumension Type + * Dimension should include all the settings dimension (e.g. node_duress.cpu_threshold, search_heap_threshold) + * Step Size in percentage + * NOT to Node Level but for whole service (so all data node instances) + * canUpdate means whether the action should be emitted + * + */ + private final String searchbpDimension; + private final double desiredValue; + private final double currentValue; + private boolean canUpdate; + private long coolOffPeriodInMillis; + + public SearchBackPressureAction( + final AppContext appContext, + final boolean canUpdate, + final long coolOffPeriodInMillis, + final String searchbpDimension, + final double desiredValue, + final double currentValue) { + super(appContext); + this.canUpdate = canUpdate; + this.coolOffPeriodInMillis = coolOffPeriodInMillis; + this.searchbpDimension = searchbpDimension; + this.desiredValue = desiredValue; + this.currentValue = currentValue; + } + + @Override + public String name() { + return NAME; + } + + @Override + public boolean canUpdate() { + return canUpdate; + } + + @Override + public long coolOffPeriodInMillis() { + return coolOffPeriodInMillis; + } + + @Override + public List impactedNodes() { + // all nodes are impacted by this change + return appContext.getDataNodeInstances().stream() + .map(NodeKey::new) 
+ .collect(Collectors.toList()); + } + + /* TO DO: Discuss the impact of SearchBackPressureAction */ + @Override + public Map impact() { + Map impact = new HashMap<>(); + for (NodeKey key : impactedNodes()) { + /* TODO: Impact Logic for SearchBackPressureAction */ + ImpactVector impactVector = new ImpactVector(); + impact.put(key, NO_IMPACT); + } + return impact; + } + + public double getCurrentValue() { + return this.currentValue; + } + + public double getDesiredValue() { + return this.desiredValue; + } + + @Override + public String summary() { + Summary summary = + new Summary( + searchbpDimension, + desiredValue, + currentValue, + DEFAULT_COOL_OFF_PERIOD_IN_MILLIS, + canUpdate); + return summary.toJson(); + } + + /* Write Static Class Summary to conver the Action POJO to JSON Object + * Key fields to be included + * 1. Dimension (name) of the setting to be modified + * 2. CurrentValue of the setting to be modified + * 2. DesiredValue of the setting to be modified + * 3. CoolOffPeriodInMillis + * 4. 
canUpdate (whether the action should be emitted) + */ + public static class Summary { + public static final String SEARCHBP_SETTING_DIMENSION = "searchbp_setting_dimension"; + public static final String DESIRED_VALUE = "desiredValue"; + public static final String CURRENT_VALUE = "currentValue"; + public static final String COOL_OFF_PERIOD = "coolOffPeriodInMillis"; + public static final String CAN_UPDATE = "canUpdate"; + + @SerializedName(value = SEARCHBP_SETTING_DIMENSION) + private String searchbpSettingDimension; + + @SerializedName(value = DESIRED_VALUE) + private double desiredValue; + + @SerializedName(value = CURRENT_VALUE) + private double currentValue; + + @SerializedName(value = COOL_OFF_PERIOD) + private long coolOffPeriodInMillis; + + @SerializedName(value = CAN_UPDATE) + private boolean canUpdate; + + public Summary( + String searchbpSettingDimension, + double desiredValue, + double currentValue, + long coolOffPeriodInMillis, + boolean canUpdate) { + this.searchbpSettingDimension = searchbpSettingDimension; + this.desiredValue = desiredValue; + this.currentValue = currentValue; + this.coolOffPeriodInMillis = coolOffPeriodInMillis; + this.canUpdate = canUpdate; + } + + public String getSearchbpSettingDimension() { + return this.searchbpSettingDimension; + } + + public double getCurrentValue() { + return this.currentValue; + } + + public double getDesiredValue() { + return this.desiredValue; + } + + public long getCoolOffPeriodInMillis() { + return coolOffPeriodInMillis; + } + public boolean getCanUpdate() { + return canUpdate; + } + public String toJson() { + Gson gson = new GsonBuilder().disableHtmlEscaping().create(); + return gson.toJson(this); + } + } } - From e51da5a60a52ee10b7bc0747b74a6bdb3837a8c4 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 14:56:02 -0700 Subject: [PATCH 30/73] Update SearchBackPressureAction (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- 
.../actions/SearchBackPressureAction.java | 69 ++++++++++++++++--- 1 file changed, 59 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 60b78f606..226e35183 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -32,8 +32,9 @@ public class SearchBackPressureAction extends SuppressibleAction { /* TO DO: Discuss the default cool off period for SearchBackPressureAction * Time to wait since last recommendation, before suggesting this action again + * Needs the action config to have the cool off period for all dimension */ - private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.DAYS.toMillis(3); + private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.HOURS.toMillis(1); // step size in percent /* From Config Per Diumension Type @@ -41,7 +42,6 @@ public class SearchBackPressureAction extends SuppressibleAction { * Step Size in percentage * NOT to Node Level but for whole service (so all data node instances) * canUpdate means whether the action should be emitted - * */ private final String searchbpDimension; private final double desiredValue; @@ -87,13 +87,16 @@ public List impactedNodes() { .collect(Collectors.toList()); } - /* TO DO: Discuss the impact of SearchBackPressureAction */ + /* TO DO: Discuss the impact of SearchBackPressureAction + * since our action only modify the threhsold settings of Search Back Pressure Service instead of actual Resource + * No Impact should be put as the Impact Vector for this action so other actions would not be affected by Searchbp-specific actions + */ @Override public Map impact() { Map impact = new HashMap<>(); for (NodeKey key : 
impactedNodes()) { /* TODO: Impact Logic for SearchBackPressureAction */ - ImpactVector impactVector = new ImpactVector(); + // ImpactVector impactVector = new ImpactVector(); impact.put(key, NO_IMPACT); } return impact; @@ -119,13 +122,59 @@ public String summary() { return summary.toJson(); } - /* Write Static Class Summary to conver the Action POJO to JSON Object + public static final class Builder { + private final AppContext appContext; + private final String searchbpDimension; + private Double currentValue; + private Double desiredValue; + private long coolOffPeriodInMillis; + + private Builder( + final AppContext appContext, + final String searchbp_dimension, + final long coolOffPeriodInMillis) { + this.appContext = appContext; + this.searchbpDimension = searchbp_dimension; + this.coolOffPeriodInMillis = coolOffPeriodInMillis; + } + + public Builder currentValue(Double currentValue) { + this.currentValue = currentValue; + return this; + } + + public Builder desiredValue(Double desiredValue) { + this.desiredValue = desiredValue; + return this; + } + + public Builder coolOffPeriodInMillis(long coolOffPeriodInMillis) { + this.coolOffPeriodInMillis = coolOffPeriodInMillis; + return this; + } + + public SearchBackPressureAction build() { + Boolean canUpdate = false; + /* + * if desiredValue is between 0 and 100 then canUpdate is true + * since desiredValue is valid */ + if (desiredValue != null) { + canUpdate = ((desiredValue >= 0) && (desiredValue <= 100)); + } + + return new SearchBackPressureAction( + appContext, canUpdate, coolOffPeriodInMillis, searchbpDimension, desiredValue, currentValue); + } + } + + + /* Write Static Class Summary to conver the Searchbp Action POJO to JSON Object * Key fields to be included - * 1. Dimension (name) of the setting to be modified - * 2. CurrentValue of the setting to be modified - * 2. DesiredValue of the setting to be modified - * 3. CoolOffPeriodInMillis - * 4. canUpdate (whether the action should be emitted) + * 1. 
Dimension (name) of the Searchbp setting to be modified + * 2. CurrentValue of the setting + * 2. DesiredValue of the setting + * 3. CoolOffPeriodInMillis for the action + * 4. canUpdate (whether the action should be emitted) */ public static class Summary { public static final String SEARCHBP_SETTING_DIMENSION = "searchbp_setting_dimension"; From 6eb937a0a629f986185e3dcb8b9d0852fdd5fe16 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 16:16:14 -0700 Subject: [PATCH 31/73] Add Searchbp Decider (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureDecider.java | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java new file mode 100644 index 000000000..9e661a21d --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -0,0 +1,80 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; + + +import java.util.List; +import org.opensearch.performanceanalyzer.AppContext; +import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decider; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decision; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.old_gen.OldGenDecisionPolicy; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.sizing.HeapSizeIncreasePolicy; +import 
org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; +import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +/** decider to change the dynamic settings of SearchBackPressure In-flight Cancellation*/ +public class SearchBackPressureDecider extends Decider { + private static final Logger LOG = LogManager.getLogger(SearchBackPressureDecider.class); + public static final String NAME = "SearchBackPressureDecider"; + + /* TO ADD: SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure */ + + private int currentIteration = 0; + private SearchBackPressureClusterRCA searchBackPressureClusterRCA; + + public SearchBackPressureDecider( + long evalIntervalSeconds, + int decisionFrequency, + SearchBackPressureClusterRCA searchBackPressureClusterRCA + ){ + super(evalIntervalSeconds, decisionFrequency, searchBackPressureClusterRCA); + this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; + LOG.info("SearchBackPressureDecider created"); + } + + @Override + public String name() { + return NAME; + } + + @Override + public Decision operate() { + LOG.info("SearchBackPressureDecider operate() with currentIteration: {}", currentIteration); + + Decision decision = new Decision(System.currentTimeMillis(), NAME); + currentIteration += 1; + if (currentIteration < decisionFrequency) { + return decision; + } + + currentIteration = 0; + + // SearchBackPressure Policy is always accepted + // List searchBackPressureActions = seart.evaluate(); + // oldGenPolicyActions.forEach(decision::addAction); + return decision; + } + + /* Read RCA Config to fill the dynamic threshold settings for the SearchBackPressure Service */ + 
@Override + public void readRcaConf(RcaConf conf) { + super.readRcaConf(conf); + // oldGenDecisionPolicy.setRcaConf(conf); + } + + /* Set AppContext for SearchBackPressurePolicy */ + @Override + public void setAppContext(final AppContext appContext) { + super.setAppContext(appContext); + // oldGenDecisionPolicy.setAppContext(appContext); + } + +} \ No newline at end of file From d3177159c5af6d927e160d68a2ce8d5a85f6f3c5 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 16:42:30 -0700 Subject: [PATCH 32/73] Add Searchbp Policy and Config (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 18 ++++--- .../SearchBackPressurePolicyConfig.java | 37 +++++++++++++++ .../SearchBackPressureDecider.java | 24 ++++------ .../SearchBackPressurePolicy.java | 47 +++++++++++++++++++ 4 files changed, 103 insertions(+), 23 deletions(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 226e35183..0449d915d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -87,10 +87,10 @@ public List impactedNodes() { .collect(Collectors.toList()); } - /* TO DO: Discuss the impact of SearchBackPressureAction + /* TO DO: Discuss the impact of SearchBackPressureAction * since our action only modify the threhsold settings of Search Back Pressure Service instead of actual Resource * No 
Impact should be put as the Impact Vector for this action so other actions would not be affected by Searchbp-specific actions - */ + */ @Override public Map impact() { Map impact = new HashMap<>(); @@ -155,7 +155,7 @@ public Builder coolOffPeriodInMillis(long coolOffPeriodInMillis) { public SearchBackPressureAction build() { Boolean canUpdate = false; - /* + /* * if desiredValue is between 0 and 100 then canUpdate is true * since desiredValue is valid */ if (desiredValue != null) { @@ -163,16 +163,20 @@ public SearchBackPressureAction build() { } return new SearchBackPressureAction( - appContext, canUpdate, coolOffPeriodInMillis, searchbpDimension, desiredValue, currentValue); + appContext, + canUpdate, + coolOffPeriodInMillis, + searchbpDimension, + desiredValue, + currentValue); } } - /* Write Static Class Summary to conver the Searchbp Action POJO to JSON Object * Key fields to be included * 1. Dimension (name) of the Searchbp setting to be modified - * 2. CurrentValue of the setting - * 2. DesiredValue of the setting + * 2. CurrentValue of the setting + * 2. DesiredValue of the setting * 3. CoolOffPeriodInMillis for the action * 4. 
canUpdate (whether the action should be emitted) */ diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java new file mode 100644 index 000000000..62ab7c4a6 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -0,0 +1,37 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.young_gen; + + +import java.util.concurrent.TimeUnit; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBackPressurePolicy; +import org.opensearch.performanceanalyzer.rca.framework.core.Config; +import org.opensearch.performanceanalyzer.rca.framework.core.NestedConfig; + +/** + * Configures various thresholds for the {@link SearchBackPressurePolicy} + * + *

The config follows the format below "decider-config-settings": { + * "search-back-pressure-policy-config": + * { + * "enabled": true, // whether the serch-back-pressure-policy should be enabled + * "hour-threshold": 30, // threshold for hourly received unhealthy cluster level rca flow units, if above, then the below thresholds should be modified + * "threshold_count": 2, // how many thresholds to be changed, in this case search-heap-threshold and search-task-heap-threshold + * "search_task_heap_stepsize_in_percentage": 5, + * "search_task_stepsize_in_percentage": 0.5" + * } + * } + * Explanation of thresholds that are being configured and modified based on current RCA flowunits: + * search_task_heap_stepsize_in_percentage: Defines the step size to change heap usage threshold (in percentage). + * for the sum of heap usages across all search tasks before in-flight cancellation is applied. + * search_task_stepsize_in_percentage: Defines the step size to change + * heap usage threshold (in percentage) for an individual task before it is considered for cancellation. 
+ * + **/ + +public class SearchBackPressurePolicyConfig { + +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 9e661a21d..1088b04ed 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -6,21 +6,15 @@ package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; -import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; -import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decider; import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decision; -import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.old_gen.OldGenDecisionPolicy; -import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.sizing.HeapSizeIncreasePolicy; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; -import org.opensearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; -import org.opensearch.performanceanalyzer.rca.store.rca.jvmsizing.LargeHeapClusterRca; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -/** decider to change the dynamic settings of SearchBackPressure In-flight Cancellation*/ +/** decider to change the dynamic settings of SearchBackPressure In-flight Cancellation */ public class SearchBackPressureDecider extends Decider { private static final 
Logger LOG = LogManager.getLogger(SearchBackPressureDecider.class); public static final String NAME = "SearchBackPressureDecider"; @@ -31,11 +25,10 @@ public class SearchBackPressureDecider extends Decider { private SearchBackPressureClusterRCA searchBackPressureClusterRCA; public SearchBackPressureDecider( - long evalIntervalSeconds, - int decisionFrequency, - SearchBackPressureClusterRCA searchBackPressureClusterRCA - ){ - super(evalIntervalSeconds, decisionFrequency, searchBackPressureClusterRCA); + long evalIntervalSeconds, + int decisionFrequency, + SearchBackPressureClusterRCA searchBackPressureClusterRCA) { + super(evalIntervalSeconds, decisionFrequency); this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; LOG.info("SearchBackPressureDecider created"); } @@ -76,5 +69,4 @@ public void setAppContext(final AppContext appContext) { super.setAppContext(appContext); // oldGenDecisionPolicy.setAppContext(appContext); } - -} \ No newline at end of file +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java new file mode 100644 index 000000000..4c73747e3 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -0,0 +1,47 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm; + +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.FULL_GC_PAUSE_TIME; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.MINOR_GC_PAUSE_TIME; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.OLD_GEN_HEAP_USAGE; +import static 
org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.YOUNG_GEN_PROMOTION_RATE; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.performanceanalyzer.AppContext; +import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; +import org.opensearch.performanceanalyzer.decisionmaker.actions.JvmGenAction; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.young_gen.JvmGenTuningPolicyConfig; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.JvmGenTuningPolicy; +import org.opensearch.performanceanalyzer.grpc.Resource; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil; +import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.framework.util.RcaConsts; +import org.opensearch.performanceanalyzer.rca.store.collector.NodeConfigCache; +import org.opensearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; +import org.opensearch.performanceanalyzer.rca.store.rca.cluster.NodeKey; + +/** + * Decides if the SearchBackPressure threshold should be modified + * 
suggests actions to take to achieve improved performance. + */ +public class SearchBackPressurePolicy implements DecisionPolicy { + +} From 8eab98a9ca62573028c543869c5d2ba379b601d5 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 16:55:31 -0700 Subject: [PATCH 33/73] Add dummy Searchbp Policy and Config in OpenSearchAnalysis Graph (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicyConfig.java | 38 +++++++----------- .../SearchBackPressurePolicy.java | 40 ++++--------------- .../rca/store/OpenSearchAnalysisGraph.java | 18 +++++++-- 3 files changed, 37 insertions(+), 59 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index 62ab7c4a6..d7156ed34 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -3,35 +3,25 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.young_gen; +package org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure; -import java.util.concurrent.TimeUnit; import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBackPressurePolicy; -import org.opensearch.performanceanalyzer.rca.framework.core.Config; -import org.opensearch.performanceanalyzer.rca.framework.core.NestedConfig; /** * Configures various thresholds for the {@link SearchBackPressurePolicy} * *

The config follows the format below "decider-config-settings": { - * "search-back-pressure-policy-config": - * { - * "enabled": true, // whether the serch-back-pressure-policy should be enabled - * "hour-threshold": 30, // threshold for hourly received unhealthy cluster level rca flow units, if above, then the below thresholds should be modified - * "threshold_count": 2, // how many thresholds to be changed, in this case search-heap-threshold and search-task-heap-threshold - * "search_task_heap_stepsize_in_percentage": 5, - * "search_task_stepsize_in_percentage": 0.5" - * } - * } - * Explanation of thresholds that are being configured and modified based on current RCA flowunits: - * search_task_heap_stepsize_in_percentage: Defines the step size to change heap usage threshold (in percentage). - * for the sum of heap usages across all search tasks before in-flight cancellation is applied. - * search_task_stepsize_in_percentage: Defines the step size to change - * heap usage threshold (in percentage) for an individual task before it is considered for cancellation. - * - **/ - -public class SearchBackPressurePolicyConfig { - -} + * "search-back-pressure-policy-config": { "enabled": true, // whether the + * serch-back-pressure-policy should be enabled "hour-threshold": 30, // threshold for hourly + * received unhealthy cluster level rca flow units, if above, then the below thresholds should be + * modified "threshold_count": 2, // how many thresholds to be changed, in this case + * search-heap-threshold and search-task-heap-threshold "search_task_heap_stepsize_in_percentage": + * 5, "search_task_stepsize_in_percentage": 0.5" } } Explanation of thresholds that are being + * configured and modified based on current RCA flowunits: search_task_heap_stepsize_in_percentage: + * Defines the step size to change heap usage threshold (in percentage). for the sum of heap usages + * across all search tasks before in-flight cancellation is applied. 
+ * search_task_stepsize_in_percentage: Defines the step size to change heap usage threshold (in + * percentage) for an individual task before it is considered for cancellation. + */ +public class SearchBackPressurePolicyConfig {} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 4c73747e3..4e4117ecb 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -3,45 +3,21 @@ * SPDX-License-Identifier: Apache-2.0 */ -package org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm; +package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; -import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.FULL_GC_PAUSE_TIME; -import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.MINOR_GC_PAUSE_TIME; -import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.OLD_GEN_HEAP_USAGE; -import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.YOUNG_GEN_PROMOTION_RATE; -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.ArrayList; import java.util.List; -import java.util.concurrent.TimeUnit; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.opensearch.performanceanalyzer.AppContext; import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; -import org.opensearch.performanceanalyzer.decisionmaker.actions.JvmGenAction; import 
org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; -import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.young_gen.JvmGenTuningPolicyConfig; -import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.JvmGenTuningPolicy; -import org.opensearch.performanceanalyzer.grpc.Resource; -import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; -import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; -import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; -import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; -import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; -import org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil; -import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; -import org.opensearch.performanceanalyzer.rca.framework.util.RcaConsts; -import org.opensearch.performanceanalyzer.rca.store.collector.NodeConfigCache; -import org.opensearch.performanceanalyzer.rca.store.rca.HighHeapUsageClusterRca; -import org.opensearch.performanceanalyzer.rca.store.rca.cluster.NodeKey; /** - * Decides if the SearchBackPressure threshold should be modified - * suggests actions to take to achieve improved performance. + * Decides if the SearchBackPressure threshold should be modified suggests actions to take to + * achieve improved performance. 
*/ public class SearchBackPressurePolicy implements DecisionPolicy { - + + @Override + public List evaluate() { + return null; + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index 970db2ace..caa7f9690 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -19,6 +19,7 @@ import org.opensearch.performanceanalyzer.decisionmaker.deciders.QueueHealthDecider; import org.opensearch.performanceanalyzer.decisionmaker.deciders.collator.Collator; import org.opensearch.performanceanalyzer.decisionmaker.deciders.jvm.HeapHealthDecider; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBackPressureDecider; import org.opensearch.performanceanalyzer.metricsdb.MetricsDB; import org.opensearch.performanceanalyzer.plugins.PluginController; import org.opensearch.performanceanalyzer.plugins.PluginControllerConfig; @@ -463,8 +464,17 @@ public void construct() { RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); - // To Do SearchBackPressure Decider + // SearchBackPressure Decider + SearchBackPressureDecider searchBackPressureDecider = + new SearchBackPressureDecider( + EVALUATION_INTERVAL_SECONDS, 12, searchBackPressureClusterRCA); + searchBackPressureDecider.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); + searchBackPressureDecider.addAllUpstreams( + Collections.singletonList(searchBackPressureClusterRCA)); + // AdmissionControl RCA Decider AdmissionControlDecider admissionControlDecider = buildAdmissionControlDecider(heapUsed, heapMax); @@ -478,7 +488,8 @@ public void construct() { queueHealthDecider, cacheHealthDecider, heapHealthDecider, - 
admissionControlDecider); + admissionControlDecider, + searchBackPressureDecider); collator.addTag( RcaConsts.RcaTagConstants.TAG_LOCUS, RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); @@ -487,7 +498,8 @@ public void construct() { queueHealthDecider, cacheHealthDecider, heapHealthDecider, - admissionControlDecider)); + admissionControlDecider, + searchBackPressureDecider)); // Publisher - Executes decisions output from collator Publisher publisher = new Publisher(EVALUATION_INTERVAL_SECONDS, collator); From 38d66e827a04dbdf8db2869ded58a9c91b604def Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 17:18:32 -0700 Subject: [PATCH 34/73] Update SearchBackpressure Policy (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressurePolicy.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 4e4117ecb..fce7e289d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -7,6 +7,8 @@ import java.util.List; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; @@ -15,6 +17,12 @@ * achieve improved performance. 
*/ public class SearchBackPressurePolicy implements DecisionPolicy { + private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicy.class); + + /* Specify a path to store SearchBackpressurePolicy_Autotune Stats */ + + /* TO DO: Check which settings should be modifed based on search heap shard/task cancellation stats */ + boolean searchTaskHeapThresholdShouldChange; @Override public List evaluate() { From a20d6f01a61dbd095b4b151164261dc18a2fd07c Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 10 Jul 2023 17:29:48 -0700 Subject: [PATCH 35/73] Update SearchBackpressure Policy (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index fce7e289d..830a4cbe6 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -9,8 +9,12 @@ import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.opensearch.performanceanalyzer.AppContext; import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; +import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; /** * Decides if the SearchBackPressure 
threshold should be modified suggests actions to take to @@ -24,8 +28,26 @@ public class SearchBackPressurePolicy implements DecisionPolicy { /* TO DO: Check which settings should be modifed based on search heap shard/task cancellation stats */ boolean searchTaskHeapThresholdShouldChange; + private AppContext appContext; + private RcaConf rcaConf; + private SearchBackPressurePolicyConfig policyConfig; + private SearchBackPressureClusterRCA searchBackPressureClusterRCA; + + /* Hourly Alarm frequency threshold */ + private int hourlyAlarmThreshold; + + /* TODO SearchBackPressurePolicy Alarm Monitor */ + @Override public List evaluate() { return null; } + + public void setAppContext(AppContext appContext) { + this.appContext = appContext; + } + + public void setRcaConf(final RcaConf rcaConf) { + this.rcaConf = rcaConf; + } } From 3f53461ba0ff2f4b45d02440d627cad5bdd9a3c4 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 09:59:14 -0700 Subject: [PATCH 36/73] Merged Main (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../deciders/searchbackpressure/SearchBackPressureDecider.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 1088b04ed..2677e25ec 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -19,7 +19,7 @@ public class SearchBackPressureDecider extends Decider { private static final Logger LOG = LogManager.getLogger(SearchBackPressureDecider.class); public static final String NAME = "SearchBackPressureDecider"; - /* TO ADD: 
SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure */ + /* TO ADD: SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure actions */ private int currentIteration = 0; private SearchBackPressureClusterRCA searchBackPressureClusterRCA; From da31bdcca1620d7268a27f9e5b1b9eeadc79942e Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 14:33:48 -0700 Subject: [PATCH 37/73] Add new AlarmMonitor (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 75 +++++++- .../SearchBpActionsAlarmMonitor.java | 160 ++++++++++++++++++ 2 files changed, 234 insertions(+), 1 deletion(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 830a4cbe6..104734b05 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -5,7 +5,11 @@ package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_SHARD; +import static org.opensearch.performanceanalyzer.rca.framework.api.summaries.ResourceUtil.SEARCHBACKPRESSURE_TASK; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -13,6 +17,11 @@ 
import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; +import org.opensearch.performanceanalyzer.grpc.Resource; +import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; @@ -36,7 +45,71 @@ public class SearchBackPressurePolicy implements DecisionPolicy { /* Hourly Alarm frequency threshold */ private int hourlyAlarmThreshold; - /* TODO SearchBackPressurePolicy Alarm Monitor */ + /* Alarm for heap usage */ + static final List HEAP_SEARCHBP_SIGNALS = + Lists.newArrayList(SEARCHBACKPRESSURE_SHARD, SEARCHBACKPRESSURE_TASK); + + /* Hourly heap used threshold */ + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapAlarm; + + public SearchBackPressurePolicy( + SearchBackPressureClusterRCA searchBackPressureClusterRCA, + SearchBpActionsAlarmMonitor searchBackPressureHeapAlarm) { + this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; + this.searchBackPressureHeapAlarm = searchBackPressureHeapAlarm; + } + + public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureClusterRCA) { + this(searchBackPressureClusterRCA, null); + } + + /** + * records issues which the policy cares about and discards others + * + * @param issue an issue with the application + */ + private void record(HotResourceSummary issue) { + 
LOG.debug("SearchBackPressurePolicy#record()"); + if (HEAP_SEARCHBP_SIGNALS.contains(issue.getResource())) { + LOG.debug("Recording issue in searchBackPressureHeapAlarm"); + searchBackPressureHeapAlarm.recordIssue(); + } + } + + /** gathers and records all issues observed in the application */ + private void recordIssues() { + if (searchBackPressureClusterRCA.getFlowUnits().isEmpty()) { + return; + } + for (ResourceFlowUnit flowUnit : + searchBackPressureClusterRCA.getFlowUnits()) { + if (!flowUnit.hasResourceSummary()) { + continue; + } + HotClusterSummary clusterSummary = flowUnit.getSummary(); + for (HotNodeSummary nodeSummary : clusterSummary.getHotNodeSummaryList()) { + for (HotResourceSummary summary : nodeSummary.getHotResourceSummaryList()) { + record(summary); + } + } + } + } + + /* TO DO: Change the logic of heapThresholdIsTooSmall */ + public boolean heapThresholdIsTooSmall(){ + return !searchBackPressureHeapAlarm.isHealthy(); + } + + /* TO DO: Change the logic of heapThresholdIsTooLarge */ + public boolean heapThresholdIsTooLarge(){ + return !searchBackPressureHeapAlarm.isHealthy(); + } + + // create alarm monitor from config + + // initalize all alarm monitors + + @Override public List evaluate() { diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java new file mode 100644 index 000000000..4f9979536 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -0,0 +1,160 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; + + +import com.google.common.annotations.VisibleForTesting; +import java.nio.file.Path; +import java.nio.file.Paths; 
+import java.util.concurrent.TimeUnit; +import javax.annotation.Nullable; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.AlarmMonitor; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindow; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; + +public class SearchBpActionsAlarmMonitor implements AlarmMonitor { + /* Current design only uses hour monitor to evaluate the health of the searchbackpressure service + * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal + */ + private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 30; + // private static final int DEFAULT_DAY_BREACH_THRESHOLD = 1; + // private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 1; + private static final String HOUR_PREFIX = "hour-"; + // private static final String DAY_PREFIX = "day-"; + // private static final String WEEK_PREFIX = "week-"; + + public static final int HOUR_MONITOR_BUCKET_WINDOW_MINUTES = 5; + // public static final int DAY_MONITOR_BUCKET_WINDOW_MINUTES = 30; + // public static final int WEEK_MONITOR_BUCKET_WINDOW_MINUTES = 86400; + + private BucketizedSlidingWindow hourMonitor; + // private BucketizedSlidingWindow dayMonitor; + // private BucketizedSlidingWindow weekMonitor; + + private int hourBreachThreshold; + // private int dayBreachThreshold; + // private int weekBreachThreshold; + + private boolean alarmHealthy = true; + + @Override + public boolean isHealthy() { + evaluateAlarm(); + return alarmHealthy; + } + + public SearchBpActionsAlarmMonitor( + int hourBreachThreshold, + @Nullable Path persistencePath, + @Nullable BucketizedSlidingWindowConfig hourMonitorConfig) { + Path hourMonitorPath = null; + if (persistencePath != null) { + Path persistenceBase = persistencePath.getParent(); + Path persistenceFile = 
persistencePath.getFileName(); + if (persistenceBase != null && persistenceFile != null) { + hourMonitorPath = + Paths.get( + persistenceBase.toString(), + HOUR_PREFIX + persistenceFile.toString()); + // weekMonitorPath = + // Paths.get( + // persistenceBase.toString(), + // WEEK_PREFIX + persistenceFile.toString()); + } + } + // initialize hour Monitor + if (hourMonitorConfig == null) { + hourMonitor = + new BucketizedSlidingWindow( + (int) TimeUnit.HOURS.toMinutes(1), + 5, + TimeUnit.MINUTES, + hourMonitorPath); + } else { + hourMonitor = new BucketizedSlidingWindow(hourMonitorConfig); + } + // // initialize weekMonitor + // if (weekMonitorConfig == null) { + // weekMonitor = new BucketizedSlidingWindow(4, 1, TimeUnit.DAYS, weekMonitorPath); + // } else { + // weekMonitor = new BucketizedSlidingWindow(weekMonitorConfig); + // } + this.hourBreachThreshold = hourBreachThreshold; + // this.weekBreachThreshold = weekBreachThreshold; + } + + public SearchBpActionsAlarmMonitor(int hourBreachThreshold, @Nullable Path persistencePath) { + this(hourBreachThreshold, persistencePath, null); + } + + public SearchBpActionsAlarmMonitor(int hourBreachThreshold) { + this(hourBreachThreshold, null, null); + } + + public SearchBpActionsAlarmMonitor(@Nullable Path persistencePath) { + this(DEFAULT_HOUR_BREACH_THRESHOLD, persistencePath); + } + + public SearchBpActionsAlarmMonitor() { + this(DEFAULT_HOUR_BREACH_THRESHOLD); + } + + @Override + public void recordIssue(long timeStamp, double value) { + SlidingWindowData dataPoint = new SlidingWindowData(timeStamp, value); + hourMonitor.next(dataPoint); + // // If we've breached the day threshold, record it as a bad day this week. 
+ // if (dayMonitor.size() >= dayBreachThreshold) { + // weekMonitor.next(new SlidingWindowData(dataPoint.getTimeStamp(), + // dataPoint.getValue())); + // } + } + + private void evaluateAlarm() { + if (alarmHealthy) { + if (hourMonitor.size() >= hourBreachThreshold) { + alarmHealthy = false; + } + } else { + if (hourMonitor.size() == 0) { + alarmHealthy = true; + } + } + } + + public int getHourBreachThreshold() { + return hourBreachThreshold; + } + + // public int getDayBreachThreshold() { + // return dayBreachThreshold; + // } + + // public int getWeekBreachThreshold() { + // return weekBreachThreshold; + // } + + @VisibleForTesting + BucketizedSlidingWindow getHourMonitor() { + return hourMonitor; + } + + // @VisibleForTesting + // BucketizedSlidingWindow getDayMonitor() { + // return dayMonitor; + // } + + // @VisibleForTesting + // BucketizedSlidingWindow getWeekMonitor() { + // return weekMonitor; + // } + + @VisibleForTesting + void setAlarmHealth(boolean isHealthy) { + this.alarmHealthy = isHealthy; + } +} From 473270cfed13de434d3627c978c5475c43dc93b4 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 15:58:04 -0700 Subject: [PATCH 38/73] Workable decider (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../deciders/configs/DeciderConfig.java | 13 +++ .../SearchBackPressurePolicyConfig.java | 70 ++++++++++++++- .../SearchBackPressureDecider.java | 7 +- .../SearchBackPressurePolicy.java | 86 +++++++++++++++++-- .../SearchBpActionsAlarmMonitor.java | 7 ++ 5 files changed, 173 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/DeciderConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/DeciderConfig.java index c2b3ed444..096c45917 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/DeciderConfig.java +++ 
b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/DeciderConfig.java @@ -8,6 +8,7 @@ import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.OldGenDecisionPolicyConfig; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.jvm.young_gen.JvmGenTuningPolicyConfig; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; import org.opensearch.performanceanalyzer.rca.framework.core.NestedConfig; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; @@ -28,11 +29,14 @@ public class DeciderConfig { private static final String OLD_GEN_DECISION_POLICY_CONFIG_NAME = "old-gen-decision-policy-config"; private static final String JVM_GEN_TUNING_POLICY_CONFIG_NAME = "jvm-gen-tuning-policy-config"; + private static final String SEARCH_BACK_PRESSURE_POLICY_CONFIG_NAME = + "search-back-pressure-policy-config"; private final CachePriorityOrderConfig cachePriorityOrderConfig; private final WorkLoadTypeConfig workLoadTypeConfig; private final OldGenDecisionPolicyConfig oldGenDecisionPolicyConfig; private final JvmGenTuningPolicyConfig jvmGenTuningPolicyConfig; + private final SearchBackPressurePolicyConfig searchBackPressurePolicyConfig; public DeciderConfig(final RcaConf rcaConf) { cachePriorityOrderConfig = @@ -51,6 +55,11 @@ public DeciderConfig(final RcaConf rcaConf) { new NestedConfig( JVM_GEN_TUNING_POLICY_CONFIG_NAME, rcaConf.getDeciderConfigSettings())); + searchBackPressurePolicyConfig = + new SearchBackPressurePolicyConfig( + new NestedConfig( + SEARCH_BACK_PRESSURE_POLICY_CONFIG_NAME, + rcaConf.getDeciderConfigSettings())); } public CachePriorityOrderConfig getCachePriorityOrderConfig() { @@ -68,4 +77,8 @@ public OldGenDecisionPolicyConfig getOldGenDecisionPolicyConfig() { public JvmGenTuningPolicyConfig getJvmGenTuningPolicyConfig() { return jvmGenTuningPolicyConfig; } + + public SearchBackPressurePolicyConfig 
getSearchBackPressurePolicyConfig() { + return searchBackPressurePolicyConfig; + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index d7156ed34..e0dd7ba9d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -6,7 +6,10 @@ package org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure; +import java.util.concurrent.TimeUnit; import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBackPressurePolicy; +import org.opensearch.performanceanalyzer.rca.framework.core.Config; +import org.opensearch.performanceanalyzer.rca.framework.core.NestedConfig; /** * Configures various thresholds for the {@link SearchBackPressurePolicy} @@ -24,4 +27,69 @@ * search_task_stepsize_in_percentage: Defines the step size to change heap usage threshold (in * percentage) for an individual task before it is considered for cancellation. 
*/ -public class SearchBackPressurePolicyConfig {} +public class SearchBackPressurePolicyConfig { + // Field Names + private static final String ENABLED = "enabled"; + private static final String HOUR_BREACH_THRESHOLD = "hour-threshold"; + private static final String THRESHOLD_COUNT = "threshold_count"; + private static final String HOUR_MONITOR_WINDOW_SIZE_MINUTES = + "hour-monitor-window-size-minutes"; + private static final String HOUR_MONITOR_BUCKET_SIZE_MINUTES = + "hour-monitor-bucket-size-minutes"; + + // Default values + public static final boolean DEFAULT_ENABLED = true; + public static final int DEFAULT_HOUR_BREACH_THRESHOLD = 30; + public static final int DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES = + (int) TimeUnit.HOURS.toMinutes(1); + public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 5; + + private Config hourBreachThreshold; + private Config enabled; + private Config hourMonitorWindowSizeMinutes; + private Config hourMonitorBucketSizeMinutes; + + public SearchBackPressurePolicyConfig(NestedConfig config) { + enabled = new Config<>(ENABLED, config.getValue(), DEFAULT_ENABLED, Boolean.class); + hourBreachThreshold = + new Config<>( + HOUR_BREACH_THRESHOLD, + config.getValue(), + DEFAULT_HOUR_BREACH_THRESHOLD, + Integer.class); + hourMonitorWindowSizeMinutes = + new Config<>( + HOUR_MONITOR_WINDOW_SIZE_MINUTES, + config.getValue(), + DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES, + Integer.class); + + hourMonitorBucketSizeMinutes = + new Config<>( + HOUR_MONITOR_BUCKET_SIZE_MINUTES, + config.getValue(), + DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES, + Integer.class); + } + + /** + * Whether or not to enable the policy. A disabled policy will not emit any actions. 
+ * + * @return Whether or not to enable the policy + */ + public boolean isEnabled() { + return enabled.getValue(); + } + + public int getHourBreachThreshold() { + return hourBreachThreshold.getValue(); + } + + public int getHourMonitorWindowSizeMinutes() { + return hourMonitorWindowSizeMinutes.getValue(); + } + + public int getHourMonitorBucketSizeMinutes() { + return hourMonitorBucketSizeMinutes.getValue(); + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 2677e25ec..3a9242129 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -20,6 +20,7 @@ public class SearchBackPressureDecider extends Decider { public static final String NAME = "SearchBackPressureDecider"; /* TO ADD: SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure actions */ + // SearchBackPressurePolicy searchBackPressurePolicy; private int currentIteration = 0; private SearchBackPressureClusterRCA searchBackPressureClusterRCA; @@ -30,7 +31,9 @@ public SearchBackPressureDecider( SearchBackPressureClusterRCA searchBackPressureClusterRCA) { super(evalIntervalSeconds, decisionFrequency); this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; - LOG.info("SearchBackPressureDecider created"); + // this.searchBackPressurePolicy = new + // SearchBackPressurePolicy(searchBackPressureClusterRCA); + LOG.info("SearchBackPressureDecider created#2"); } @Override @@ -51,7 +54,7 @@ public Decision operate() { currentIteration = 0; // SearchBackPressure Policy is always accepted - // List searchBackPressureActions = 
seart.evaluate(); + // List searchBackPressureActions = searchBackPressurePolicy.evaluate(); // oldGenPolicyActions.forEach(decision::addAction); return decision; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 104734b05..16a15ddc5 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -10,11 +10,15 @@ import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.Lists; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; +import org.opensearch.performanceanalyzer.decisionmaker.actions.SearchBackPressureAction; import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; import org.opensearch.performanceanalyzer.grpc.Resource; @@ -23,6 +27,7 @@ import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; +import org.opensearch.performanceanalyzer.rca.framework.util.RcaConsts; import org.opensearch.performanceanalyzer.rca.store.rca.searchbackpressure.SearchBackPressureClusterRCA; /** @@ -32,6 +37,12 @@ public class 
SearchBackPressurePolicy implements DecisionPolicy { private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicy.class); + // TO DO READ THE cool off period from the config file of the action + private static final long DEAFULT_COOLOFF_PERIOD_IN_MILLIS = 60L * 60L * 1000L; + + private static final Path SEARCHBP_DATA_FILE_PATH = + Paths.get(RcaConsts.CONFIG_DIR_PATH, "SearchBackPressurePolicy_heap"); + /* Specify a path to store SearchBackpressurePolicy_Autotune Stats */ /* TO DO: Check which settings should be modifed based on search heap shard/task cancellation stats */ @@ -69,15 +80,18 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC * @param issue an issue with the application */ private void record(HotResourceSummary issue) { - LOG.debug("SearchBackPressurePolicy#record()"); + LOG.info("SearchBackPressurePolicy#record()"); if (HEAP_SEARCHBP_SIGNALS.contains(issue.getResource())) { - LOG.debug("Recording issue in searchBackPressureHeapAlarm"); + LOG.info( + "Recording issue in searchBackPressureHeapAlarm since Resource Searchbp Shard or Task appears"); searchBackPressureHeapAlarm.recordIssue(); } } /** gathers and records all issues observed in the application */ private void recordIssues() { + LOG.info("SearchBackPressurePolicy#recordIssues()"); + if (searchBackPressureClusterRCA.getFlowUnits().isEmpty()) { return; } @@ -96,24 +110,82 @@ private void recordIssues() { } /* TO DO: Change the logic of heapThresholdIsTooSmall */ - public boolean heapThresholdIsTooSmall(){ + public boolean heapThresholdIsTooSmall() { return !searchBackPressureHeapAlarm.isHealthy(); } /* TO DO: Change the logic of heapThresholdIsTooLarge */ - public boolean heapThresholdIsTooLarge(){ + public boolean heapThresholdIsTooLarge() { return !searchBackPressureHeapAlarm.isHealthy(); } // create alarm monitor from config + public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) { + // BucketizedSlidingWindowConfig 
hourMonitorConfig = + // new BucketizedSlidingWindowConfig( + // policyConfig.getDayMonitorWindowSizeMinutes(), + // policyConfig.getDayMonitorBucketSizeMinutes(), + // TimeUnit.MINUTES, + // persistenceBasePath); + + return new SearchBpActionsAlarmMonitor(); + } // initalize all alarm monitors - - + public void initialize() { + LOG.info("Initializing alarms with dummy path"); + if (searchBackPressureHeapAlarm == null) { + searchBackPressureHeapAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + } + } @Override public List evaluate() { - return null; + LOG.info("Evaluating SearchBackPressurePolicy Evaluate() called"); + List actions = new ArrayList<>(); + if (rcaConf == null || appContext == null) { + LOG.error("rca conf/app context is null, return empty action list"); + return actions; + } + + policyConfig = rcaConf.getDeciderConfig().getSearchBackPressurePolicyConfig(); + if (!policyConfig.isEnabled()) { + LOG.debug("SearchBackPressurePolicy is disabled"); + return actions; + } + + initialize(); + LOG.info( + "searchBackPressureHeapAlarm#hour breach threshold is {}", + searchBackPressureHeapAlarm.getHourBreachThreshold()); + + recordIssues(); + + if (heapThresholdIsTooSmall()) { + LOG.info( + "SearchBackPressurePolicy#evaluate() heap usage need to be autotuned. 
Action Added!"); + // suggest the downstream cls to modify heap usgae threshold + actions.add( + new SearchBackPressureAction( + appContext, + true, + DEAFULT_COOLOFF_PERIOD_IN_MILLIS, + "heap_usage", + 75.0, + 70)); + } + + // else if (youngGenerationIsTooSmall()) { + // LOG.debug("The young generation is too small!"); + // int newRatio = computeIncrease(getCurrentRatio()); + // if (newRatio >= 1) { + // LOG.debug("Adding new JvmGenAction with ratio {}", newRatio); + // actions.add(new JvmGenAction(appContext, newRatio, COOLOFF_PERIOD_IN_MILLIS, + // true)); + // } + // } + + return actions; } public void setAppContext(AppContext appContext) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index 4f9979536..e51921d26 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -11,12 +11,15 @@ import java.nio.file.Paths; import java.util.concurrent.TimeUnit; import javax.annotation.Nullable; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.decisionmaker.deciders.AlarmMonitor; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindow; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.SlidingWindowData; public class SearchBpActionsAlarmMonitor implements AlarmMonitor { + private static final Logger LOG = LogManager.getLogger(SearchBpActionsAlarmMonitor.class); /* Current design only uses hour monitor to evaluate 
the health of the searchbackpressure service * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal */ @@ -117,6 +120,10 @@ public void recordIssue(long timeStamp, double value) { private void evaluateAlarm() { if (alarmHealthy) { if (hourMonitor.size() >= hourBreachThreshold) { + LOG.info( + "Search Backpressure Actions Alarm is Unhealthy because hourMonitor.size() is {}, and threshold is {}", + hourMonitor.size(), + hourBreachThreshold); alarmHealthy = false; } } else { From 59f47c82437ff9f68e12848cc06efe2f561c0fad Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 16:28:40 -0700 Subject: [PATCH 39/73] Workable decider (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureDecider.java | 17 ++++++++++------- .../SearchBackPressurePolicy.java | 11 +++++++++-- .../SearchBpActionsAlarmMonitor.java | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 3a9242129..210d230d4 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -6,9 +6,11 @@ package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure; +import java.util.List; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; +import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decider; import 
org.opensearch.performanceanalyzer.decisionmaker.deciders.Decision; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; @@ -20,7 +22,7 @@ public class SearchBackPressureDecider extends Decider { public static final String NAME = "SearchBackPressureDecider"; /* TO ADD: SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure actions */ - // SearchBackPressurePolicy searchBackPressurePolicy; + SearchBackPressurePolicy searchBackPressurePolicy; private int currentIteration = 0; private SearchBackPressureClusterRCA searchBackPressureClusterRCA; @@ -31,8 +33,7 @@ public SearchBackPressureDecider( SearchBackPressureClusterRCA searchBackPressureClusterRCA) { super(evalIntervalSeconds, decisionFrequency); this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; - // this.searchBackPressurePolicy = new - // SearchBackPressurePolicy(searchBackPressureClusterRCA); + this.searchBackPressurePolicy = new SearchBackPressurePolicy(searchBackPressureClusterRCA); LOG.info("SearchBackPressureDecider created#2"); } @@ -43,7 +44,9 @@ public String name() { @Override public Decision operate() { - LOG.info("SearchBackPressureDecider operate() with currentIteration: {}", currentIteration); + LOG.info( + "SearchBackPressureDecider#2 operate() with currentIteration: {}", + currentIteration); Decision decision = new Decision(System.currentTimeMillis(), NAME); currentIteration += 1; @@ -54,7 +57,7 @@ public Decision operate() { currentIteration = 0; // SearchBackPressure Policy is always accepted - // List searchBackPressureActions = searchBackPressurePolicy.evaluate(); + List searchBackPressureActions = searchBackPressurePolicy.evaluate(); // oldGenPolicyActions.forEach(decision::addAction); return decision; } @@ -63,13 +66,13 @@ public Decision operate() { @Override public void readRcaConf(RcaConf conf) { super.readRcaConf(conf); - // oldGenDecisionPolicy.setRcaConf(conf); + searchBackPressurePolicy.setRcaConf(conf); 
} /* Set AppContext for SearchBackPressurePolicy */ @Override public void setAppContext(final AppContext appContext) { super.setAppContext(appContext); - // oldGenDecisionPolicy.setAppContext(appContext); + searchBackPressurePolicy.setAppContext(appContext); } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 16a15ddc5..420f1c70d 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -68,6 +68,7 @@ public SearchBackPressurePolicy( SearchBpActionsAlarmMonitor searchBackPressureHeapAlarm) { this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; this.searchBackPressureHeapAlarm = searchBackPressureHeapAlarm; + LOG.info("SearchBackPressurePolicy#SearchBackPressurePolicy() initialize"); } public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureClusterRCA) { @@ -95,14 +96,20 @@ private void recordIssues() { if (searchBackPressureClusterRCA.getFlowUnits().isEmpty()) { return; } + int test_counter = 0; for (ResourceFlowUnit flowUnit : searchBackPressureClusterRCA.getFlowUnits()) { if (!flowUnit.hasResourceSummary()) { continue; } + // print out the total number of flow units in length HotClusterSummary clusterSummary = flowUnit.getSummary(); for (HotNodeSummary nodeSummary : clusterSummary.getHotNodeSummaryList()) { for (HotResourceSummary summary : nodeSummary.getHotResourceSummaryList()) { + test_counter += 1; + LOG.info( + "SearchBackPressurePolicy#recordIssues() Summary test_counter: " + + test_counter); record(summary); } } @@ -141,7 +148,7 @@ public void initialize() { @Override public List evaluate() { - 
LOG.info("Evaluating SearchBackPressurePolicy Evaluate() called"); + LOG.info("---------------evaluate() called"); List actions = new ArrayList<>(); if (rcaConf == null || appContext == null) { LOG.error("rca conf/app context is null, return empty action list"); @@ -150,7 +157,7 @@ public List evaluate() { policyConfig = rcaConf.getDeciderConfig().getSearchBackPressurePolicyConfig(); if (!policyConfig.isEnabled()) { - LOG.debug("SearchBackPressurePolicy is disabled"); + LOG.info("SearchBackPressurePolicy is disabled"); return actions; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index e51921d26..3afde3461 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -23,7 +23,7 @@ public class SearchBpActionsAlarmMonitor implements AlarmMonitor { /* Current design only uses hour monitor to evaluate the health of the searchbackpressure service * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal */ - private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 30; + private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; // private static final int DEFAULT_DAY_BREACH_THRESHOLD = 1; // private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 1; private static final String HOUR_PREFIX = "hour-"; From 015593c4ec1ea7f27dcfb88c164b894d38786b91 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 2023 16:56:49 -0700 Subject: [PATCH 40/73] Workable pipeline (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressurePolicy.java | 3 ++- 
.../searchbackpressure/SearchBpActionsAlarmMonitor.java | 6 +++++- .../rca/configs/SearchBackPressureRcaConfig.java | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 420f1c70d..63b79c5fc 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -94,6 +94,8 @@ private void recordIssues() { LOG.info("SearchBackPressurePolicy#recordIssues()"); if (searchBackPressureClusterRCA.getFlowUnits().isEmpty()) { + LOG.info( + "SearchBackPressurePolicy#recordIssues() No flow units in searchBackPressureClusterRCA"); return; } int test_counter = 0; @@ -148,7 +150,6 @@ public void initialize() { @Override public List evaluate() { - LOG.info("---------------evaluate() called"); List actions = new ArrayList<>(); if (rcaConf == null || appContext == null) { LOG.error("rca conf/app context is null, return empty action list"); diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index 3afde3461..54b038dcc 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -74,7 +74,7 @@ public SearchBpActionsAlarmMonitor( hourMonitor = new BucketizedSlidingWindow( (int) TimeUnit.HOURS.toMinutes(1), - 5, + 1, 
TimeUnit.MINUTES, hourMonitorPath); } else { @@ -109,6 +109,7 @@ public SearchBpActionsAlarmMonitor() { @Override public void recordIssue(long timeStamp, double value) { SlidingWindowData dataPoint = new SlidingWindowData(timeStamp, value); + LOG.info("Search Backpressure Actions Alarm is recording a new issue at {}", timeStamp); hourMonitor.next(dataPoint); // // If we've breached the day threshold, record it as a bad day this week. // if (dayMonitor.size() >= dayBreachThreshold) { @@ -119,6 +120,7 @@ public void recordIssue(long timeStamp, double value) { private void evaluateAlarm() { if (alarmHealthy) { + LOG.info("Alarm healthy with hourMonitor.size() = {}", hourMonitor.size()); if (hourMonitor.size() >= hourBreachThreshold) { LOG.info( "Search Backpressure Actions Alarm is Unhealthy because hourMonitor.size() is {}, and threshold is {}", @@ -127,7 +129,9 @@ private void evaluateAlarm() { alarmHealthy = false; } } else { + LOG.info("Alarm not healthy"); if (hourMonitor.size() == 0) { + LOG.info("SearchBackpressure Hour Monitor is healthy for zero capacity"); alarmHealthy = true; } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java index 8838ce12f..56672da1e 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/configs/SearchBackPressureRcaConfig.java @@ -21,7 +21,7 @@ public class SearchBackPressureRcaConfig { public static final long DEFAULT_EVALUATION_INTERVAL_IN_S = 60; /* interval period to call operate() */ - public static final long EVAL_INTERVAL_IN_S = 5; + public static final long EVAL_INTERVAL_IN_S = 2; /* Increase Threshold */ // node max heap usage in last 60 secs is less than 70% From c8dbb2ceef361ba89630feecb6ccdae048960cc7 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Tue, 11 Jul 
2023 17:01:36 -0700 Subject: [PATCH 41/73] Workable pipeline (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressureDecider.java | 4 +++- .../searchbackpressure/SearchBpActionsAlarmMonitor.java | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 210d230d4..a053dc845 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -58,7 +58,9 @@ public Decision operate() { // SearchBackPressure Policy is always accepted List searchBackPressureActions = searchBackPressurePolicy.evaluate(); - // oldGenPolicyActions.forEach(decision::addAction); + searchBackPressureActions.forEach(decision::addAction); + + LOG.info("decision action size is {}", decision.getActions().size()); return decision; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index 54b038dcc..2dc130156 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -71,6 +71,8 @@ public SearchBpActionsAlarmMonitor( } // initialize hour Monitor if (hourMonitorConfig == null) { + // Bucket Window Size means the number of issues can exists in a bucket + // when you 
consider about the size of the BucketizedSlidingWindow, the size is the number of buckets, not issues hourMonitor = new BucketizedSlidingWindow( (int) TimeUnit.HOURS.toMinutes(1), @@ -86,6 +88,7 @@ public SearchBpActionsAlarmMonitor( // } else { // weekMonitor = new BucketizedSlidingWindow(weekMonitorConfig); // } + this.hourBreachThreshold = hourBreachThreshold; // this.weekBreachThreshold = weekBreachThreshold; } From 68f27c656c7fff6f5be6a8ef3e6957bea5fee0e4 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 12 Jul 2023 11:15:44 -0700 Subject: [PATCH 42/73] Update ActionPojo (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 15 +++++---------- .../SearchBpActionsAlarmMonitor.java | 3 ++- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 0449d915d..4cb275e41 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -22,12 +22,6 @@ public class SearchBackPressureAction extends SuppressibleAction { private static final Logger LOG = LogManager.getLogger(SearchBackPressureAction.class); public static final String NAME = "SearchBackPressureAction"; - private static final String ID_KEY = "Id"; - private static final String IP_KEY = "Ip"; - - /* placeholder for dummy impactVector - * TODO: Remove - */ private static final ImpactVector NO_IMPACT = new ImpactVector(); /* TO DO: Discuss the default cool off period for SearchBackPressureAction @@ -36,11 +30,11 @@ public class SearchBackPressureAction extends SuppressibleAction { */ private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = 
TimeUnit.HOURS.toMillis(1); - // step size in percent /* From Config Per Diumension Type + * TO DO: what to put in the config file * Dimension should include all the settings dimension (e.g. node_duress.cpu_threshold, search_heap_threshold) * Step Size in percentage - * NOT to Node Level but for whole service (so all data node instances) + * cool off period * canUpdate means whether the action should be emitted */ private final String searchbpDimension; @@ -95,8 +89,9 @@ public List impactedNodes() { public Map impact() { Map impact = new HashMap<>(); for (NodeKey key : impactedNodes()) { - /* TODO: Impact Logic for SearchBackPressureAction */ - // ImpactVector impactVector = new ImpactVector(); + // Since SearchBackPressure Service only modify the threshold rather than general + // resources like CPU/Heap + // So there is no impact on the dimensions impact.put(key, NO_IMPACT); } return impact; diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index 2dc130156..ef86a8d85 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -72,7 +72,8 @@ public SearchBpActionsAlarmMonitor( // initialize hour Monitor if (hourMonitorConfig == null) { // Bucket Window Size means the number of issues can exists in a bucket - // when you consider about the size of the BucketizedSlidingWindow, the size is the number of buckets, not issues + // when you consider about the size of the BucketizedSlidingWindow, the size is the + // number of buckets, not issues hourMonitor = new BucketizedSlidingWindow( (int) TimeUnit.HOURS.toMinutes(1), From 
4e5686306d9211a50435692e5c68aab2d7e85ddf Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 12 Jul 2023 17:09:34 -0700 Subject: [PATCH 43/73] Framework can read from config (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 6 +++- .../SearchBackPressurePolicyConfig.java | 14 ++++++++-- .../SearchBackPressurePolicy.java | 28 +++++++++++++------ .../SearchBpActionsAlarmMonitor.java | 19 ++++++++----- 4 files changed, 48 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 4cb275e41..686fbbc64 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -31,7 +31,7 @@ public class SearchBackPressureAction extends SuppressibleAction { private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.HOURS.toMillis(1); /* From Config Per Diumension Type - * TO DO: what to put in the config file + * TO DO: what to put in the config file * Dimension should include all the settings dimension (e.g. node_duress.cpu_threshold, search_heap_threshold) * Step Size in percentage * cool off period @@ -210,6 +210,10 @@ public Summary( this.canUpdate = canUpdate; } + /* + * Dimension is the name of the setting to be modified + * e.g. 
node_duress.cpu_threshold, node_duress.search_heap_threshold + */ public String getSearchbpSettingDimension() { return this.searchbpSettingDimension; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index e0dd7ba9d..65b8624aa 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -7,6 +7,8 @@ import java.util.concurrent.TimeUnit; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBackPressurePolicy; import org.opensearch.performanceanalyzer.rca.framework.core.Config; import org.opensearch.performanceanalyzer.rca.framework.core.NestedConfig; @@ -28,9 +30,11 @@ * percentage) for an individual task before it is considered for cancellation. 
*/ public class SearchBackPressurePolicyConfig { + private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicyConfig.class); + // Field Names private static final String ENABLED = "enabled"; - private static final String HOUR_BREACH_THRESHOLD = "hour-threshold"; + private static final String HOUR_BREACH_THRESHOLD = "hour-breach-threshold"; private static final String THRESHOLD_COUNT = "threshold_count"; private static final String HOUR_MONITOR_WINDOW_SIZE_MINUTES = "hour-monitor-window-size-minutes"; @@ -39,7 +43,8 @@ public class SearchBackPressurePolicyConfig { // Default values public static final boolean DEFAULT_ENABLED = true; - public static final int DEFAULT_HOUR_BREACH_THRESHOLD = 30; + // TO DO Decide the Defauilt Hour breach threshold + public static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; public static final int DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES = (int) TimeUnit.HOURS.toMinutes(1); public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 5; @@ -57,13 +62,16 @@ public SearchBackPressurePolicyConfig(NestedConfig config) { config.getValue(), DEFAULT_HOUR_BREACH_THRESHOLD, Integer.class); + LOG.info( + "SearchBackPressurePolicyConfig hour breach threshold is: {}", + hourBreachThreshold.getValue()); hourMonitorWindowSizeMinutes = new Config<>( HOUR_MONITOR_WINDOW_SIZE_MINUTES, config.getValue(), DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES, Integer.class); - + LOG.info("hourMonitorWindowSizeMinutes is: {}", hourMonitorWindowSizeMinutes.getValue()); hourMonitorBucketSizeMinutes = new Config<>( HOUR_MONITOR_BUCKET_SIZE_MINUTES, diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 63b79c5fc..987139fe4 100644 --- 
a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -14,6 +14,7 @@ import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; @@ -22,6 +23,7 @@ import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; import org.opensearch.performanceanalyzer.grpc.Resource; +import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; @@ -130,14 +132,23 @@ public boolean heapThresholdIsTooLarge() { // create alarm monitor from config public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) { - // BucketizedSlidingWindowConfig hourMonitorConfig = - // new BucketizedSlidingWindowConfig( - // policyConfig.getDayMonitorWindowSizeMinutes(), - // policyConfig.getDayMonitorBucketSizeMinutes(), - // TimeUnit.MINUTES, - // persistenceBasePath); - - return new SearchBpActionsAlarmMonitor(); + // LOG the policyConfig.getHourMonitorWindowSizeMinutes() BuketSize and dahy breanch + // threhsold + LOG.info( + "createAlarmMonitor with hour window: {}, bucket size: {}, hour threshold: {}", + policyConfig.getHourMonitorWindowSizeMinutes(), + policyConfig.getHourMonitorBucketSizeMinutes(), + policyConfig.getHourBreachThreshold()); + 
BucketizedSlidingWindowConfig hourMonitorConfig = + new BucketizedSlidingWindowConfig( + policyConfig.getHourMonitorWindowSizeMinutes(), + policyConfig.getHourMonitorBucketSizeMinutes(), + TimeUnit.MINUTES, + persistenceBasePath); + + // TODO: Check whether we need a persistence path to write our data + // + return new SearchBpActionsAlarmMonitor(policyConfig.getHourBreachThreshold()); } // initalize all alarm monitors @@ -161,6 +172,7 @@ public List evaluate() { LOG.info("SearchBackPressurePolicy is disabled"); return actions; } + LOG.info("Evaluate() of SearchBackpressurePolicy."); initialize(); LOG.info( diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index ef86a8d85..e53d689f8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -22,8 +22,10 @@ public class SearchBpActionsAlarmMonitor implements AlarmMonitor { private static final Logger LOG = LogManager.getLogger(SearchBpActionsAlarmMonitor.class); /* Current design only uses hour monitor to evaluate the health of the searchbackpressure service * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal + * TODO: Remove 2 for testing, replace with 30 */ private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; + private static final int DEFAULT_BUCKET_WINDOW_SIZE = 1; // private static final int DEFAULT_DAY_BREACH_THRESHOLD = 1; // private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 1; private static final String HOUR_PREFIX = "hour-"; @@ -71,13 +73,15 @@ public SearchBpActionsAlarmMonitor( } // initialize hour Monitor if (hourMonitorConfig == null) 
{ - // Bucket Window Size means the number of issues can exists in a bucket - // when you consider about the size of the BucketizedSlidingWindow, the size is the - // number of buckets, not issues + /* + * Bucket Window Size means the number of issues can exist in a bucket + * when you consider about the size of the BucketizedSlidingWindow, the size is the + * number of buckets, not issues + */ hourMonitor = new BucketizedSlidingWindow( (int) TimeUnit.HOURS.toMinutes(1), - 1, + DEFAULT_BUCKET_WINDOW_SIZE, TimeUnit.MINUTES, hourMonitorPath); } else { @@ -115,9 +119,10 @@ public void recordIssue(long timeStamp, double value) { SlidingWindowData dataPoint = new SlidingWindowData(timeStamp, value); LOG.info("Search Backpressure Actions Alarm is recording a new issue at {}", timeStamp); hourMonitor.next(dataPoint); - // // If we've breached the day threshold, record it as a bad day this week. - // if (dayMonitor.size() >= dayBreachThreshold) { - // weekMonitor.next(new SlidingWindowData(dataPoint.getTimeStamp(), + + // // If we've breached the hour threshold, record it as a bad day/ + // if (hourMonitor.size() >= hourBreachThreshold) { + // dayMonitor.next(new SlidingWindowData(dataPoint.getTimeStamp(), // dataPoint.getValue())); // } } From 1e9de379a3a26a503de4439337179325c1a7b227 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 13 Jul 2023 11:07:29 -0700 Subject: [PATCH 44/73] Add Policy Increase/Decrease Alarm (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 47 ++++++++++++------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 987139fe4..5d52f034d 100644 --- 
a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -39,7 +39,8 @@ public class SearchBackPressurePolicy implements DecisionPolicy { private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicy.class); - // TO DO READ THE cool off period from the config file of the action + // TO DO + // Decide the Cooloff Period for the action private static final long DEAFULT_COOLOFF_PERIOD_IN_MILLIS = 60L * 60L * 1000L; private static final Path SEARCHBP_DATA_FILE_PATH = @@ -48,8 +49,6 @@ public class SearchBackPressurePolicy implements DecisionPolicy { /* Specify a path to store SearchBackpressurePolicy_Autotune Stats */ /* TO DO: Check which settings should be modifed based on search heap shard/task cancellation stats */ - boolean searchTaskHeapThresholdShouldChange; - private AppContext appContext; private RcaConf rcaConf; private SearchBackPressurePolicyConfig policyConfig; @@ -62,14 +61,15 @@ public class SearchBackPressurePolicy implements DecisionPolicy { static final List HEAP_SEARCHBP_SIGNALS = Lists.newArrayList(SEARCHBACKPRESSURE_SHARD, SEARCHBACKPRESSURE_TASK); - /* Hourly heap used threshold */ - @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapAlarm; + /* alarm monitor per threshold per increase/decrease */ + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapIncreaseAlarm; + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapDecreaseAlarm; public SearchBackPressurePolicy( SearchBackPressureClusterRCA searchBackPressureClusterRCA, - SearchBpActionsAlarmMonitor searchBackPressureHeapAlarm) { + SearchBpActionsAlarmMonitor searchBackPressureHeapIncreaseAlarm) { this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; - this.searchBackPressureHeapAlarm = searchBackPressureHeapAlarm; + 
this.searchBackPressureHeapIncreaseAlarm = searchBackPressureHeapIncreaseAlarm; LOG.info("SearchBackPressurePolicy#SearchBackPressurePolicy() initialize"); } @@ -86,8 +86,8 @@ private void record(HotResourceSummary issue) { LOG.info("SearchBackPressurePolicy#record()"); if (HEAP_SEARCHBP_SIGNALS.contains(issue.getResource())) { LOG.info( - "Recording issue in searchBackPressureHeapAlarm since Resource Searchbp Shard or Task appears"); - searchBackPressureHeapAlarm.recordIssue(); + "Recording issue in searchBackPressureHeapIncreaseAlarm since Resource Searchbp Shard or Task appears"); + searchBackPressureHeapIncreaseAlarm.recordIssue(); } } @@ -115,6 +115,7 @@ private void recordIssues() { "SearchBackPressurePolicy#recordIssues() Summary test_counter: " + test_counter); record(summary); + // TO DO: Check if we need to increase or decrease the heap threshold } } } @@ -122,12 +123,12 @@ private void recordIssues() { /* TO DO: Change the logic of heapThresholdIsTooSmall */ public boolean heapThresholdIsTooSmall() { - return !searchBackPressureHeapAlarm.isHealthy(); + return !searchBackPressureHeapIncreaseAlarm.isHealthy(); } /* TO DO: Change the logic of heapThresholdIsTooLarge */ public boolean heapThresholdIsTooLarge() { - return !searchBackPressureHeapAlarm.isHealthy(); + return !searchBackPressureHeapIncreaseAlarm.isHealthy(); } // create alarm monitor from config @@ -154,8 +155,8 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) // initalize all alarm monitors public void initialize() { LOG.info("Initializing alarms with dummy path"); - if (searchBackPressureHeapAlarm == null) { - searchBackPressureHeapAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + if (searchBackPressureHeapIncreaseAlarm == null) { + searchBackPressureHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } } @@ -176,14 +177,14 @@ public List evaluate() { initialize(); LOG.info( - "searchBackPressureHeapAlarm#hour breach threshold is {}", - 
searchBackPressureHeapAlarm.getHourBreachThreshold()); + "searchBackPressureHeapIncreaseAlarm#hour breach threshold is {}", + searchBackPressureHeapIncreaseAlarm.getHourBreachThreshold()); recordIssues(); if (heapThresholdIsTooSmall()) { LOG.info( - "SearchBackPressurePolicy#evaluate() heap usage need to be autotuned. Action Added!"); + "SearchBackPressurePolicy#evaluate() heap usage need to be autotuned. raise heap suage threshold action Added!"); // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( @@ -195,6 +196,20 @@ public List evaluate() { 70)); } + if (heapThresholdIsTooLarge()) { + LOG.info( + "SearchBackPressurePolicy#evaluate() heap usage need to be autotuned. drop heap suage threshold action Added!"); + // suggest the downstream cls to modify heap usgae threshold + actions.add( + new SearchBackPressureAction( + appContext, + true, + DEAFULT_COOLOFF_PERIOD_IN_MILLIS, + "heap_usage", + 65.0, + 70)); + } + // else if (youngGenerationIsTooSmall()) { // LOG.debug("The young generation is too small!"); // int newRatio = computeIncrease(getCurrentRatio()); From 84d9d65593629424c0fe6a30265a3a84d6a64a48 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 14 Jul 2023 11:27:10 -0700 Subject: [PATCH 45/73] Generic Framework can generate shard/task and increase/decrease actions (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 137 +++++++------- .../SearchBackPressurePolicyConfig.java | 2 +- .../SearchBackPressurePolicy.java | 171 ++++++++++++------ .../SearchBpActionsAlarmMonitor.java | 2 +- 4 files changed, 187 insertions(+), 125 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 686fbbc64..7d6275726 100644 --- 
a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -28,34 +28,34 @@ public class SearchBackPressureAction extends SuppressibleAction { * Time to wait since last recommendation, before suggesting this action again * Needs the action config to have the cool off period for all dimension */ - private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.HOURS.toMillis(1); + private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.DAYS.toMillis(1); /* From Config Per Diumension Type - * TO DO: what to put in the config file - * Dimension should include all the settings dimension (e.g. node_duress.cpu_threshold, search_heap_threshold) - * Step Size in percentage - * cool off period - * canUpdate means whether the action should be emitted + * canUpdate: whether the action should be emitted + * coolOffPeriodInMillis: how long the CoolOffPeriod the action should before reemit + * thresholdname: the name of threshold we are tuning (e.g. 
node_duress.cpu_threshold, search_heap_threshold) + * dimension: shard/task (String) + * Step Size in percentage: how much should the threshold change in percentage */ - private final String searchbpDimension; - private final double desiredValue; - private final double currentValue; private boolean canUpdate; private long coolOffPeriodInMillis; + private String thresholdName; + private String dimension; + private double stepSizeInPercentage; public SearchBackPressureAction( final AppContext appContext, final boolean canUpdate, final long coolOffPeriodInMillis, - final String searchbpDimension, - final double desiredValue, - final double currentValue) { + final String thresholdName, + final String dimension, + final double stepSizeInPercentage) { super(appContext); this.canUpdate = canUpdate; this.coolOffPeriodInMillis = coolOffPeriodInMillis; - this.searchbpDimension = searchbpDimension; - this.desiredValue = desiredValue; - this.currentValue = currentValue; + this.thresholdName = thresholdName; + this.dimension = dimension; + this.stepSizeInPercentage = stepSizeInPercentage; } @Override @@ -97,49 +97,54 @@ public Map impact() { return impact; } - public double getCurrentValue() { - return this.currentValue; + public String getThresholdName() { + return thresholdName; } - public double getDesiredValue() { - return this.desiredValue; + public String getDimension() { + return dimension; + } + + public double getStepSizeInPercentage() { + return stepSizeInPercentage; } @Override public String summary() { Summary summary = new Summary( - searchbpDimension, - desiredValue, - currentValue, + thresholdName, + dimension, + stepSizeInPercentage, DEFAULT_COOL_OFF_PERIOD_IN_MILLIS, canUpdate); return summary.toJson(); } public static final class Builder { + public static final boolean DEFAULT_CAN_UPDATE = true; + private final AppContext appContext; - private final String searchbpDimension; - private Double currentValue; - private Double desiredValue; + private final String 
thresholdName; + private final String dimension; + private boolean canUpdate; + private double stepSizeInPercentage; private long coolOffPeriodInMillis; private Builder( final AppContext appContext, - final String searchbp_dimension, + final String thresholdName, + final String dimension, final long coolOffPeriodInMillis) { this.appContext = appContext; - this.searchbpDimension = searchbp_dimension; + this.thresholdName = thresholdName; + this.dimension = dimension; this.coolOffPeriodInMillis = coolOffPeriodInMillis; + this.canUpdate = DEFAULT_CAN_UPDATE; } - public Builder currentValue(Double currentValue) { - this.currentValue = currentValue; - return this; - } - - public Builder desiredValue(Double desiredValue) { - this.desiredValue = desiredValue; + public Builder stepSizeInPercentage(double stepSizeInPercentage) { + this.stepSizeInPercentage = stepSizeInPercentage; return this; } @@ -149,47 +154,39 @@ public Builder coolOffPeriodInMillis(long coolOffPeriodInMillis) { } public SearchBackPressureAction build() { - Boolean canUpdate = false; - /* - * if desiredValue is between 0 and 100 then canUpdate is true - * since desiredValue is valid */ - if (desiredValue != null) { - canUpdate = ((desiredValue >= 0) && (desiredValue <= 100)); - } - return new SearchBackPressureAction( appContext, canUpdate, coolOffPeriodInMillis, - searchbpDimension, - desiredValue, - currentValue); + thresholdName, + dimension, + stepSizeInPercentage); } } /* Write Static Class Summary to conver the Searchbp Action POJO to JSON Object * Key fields to be included - * 1. Dimension (name) of the Searchbp setting to be modified - * 2. CurrentValue of the setting - * 2. DesiredValue of the setting - * 3. CoolOffPeriodInMillis for the action - * 4. canUpdate (whether the action should be emitted) + * 1. ThresholdName of the Searchbp setting to be modified + * 2. Dimension of the action (Shard/Task) + * 3. StepSizeInPercentage to change the threshold + * 4. 
CoolOffPeriodInMillis for the action + * 5. canUpdate (whether the action should be emitted) */ public static class Summary { - public static final String SEARCHBP_SETTING_DIMENSION = "searchbp_setting_dimension"; - public static final String DESIRED_VALUE = "desiredValue"; - public static final String CURRENT_VALUE = "currentValue"; + public static final String THRESHOLD_NAME = "thresholdName"; + public static final String SEARCHBP_DIMENSION = "searchbpDimension"; + public static final String STEP_SIZE_IN_PERCENTAGE = "stepSizeInPercentage"; public static final String COOL_OFF_PERIOD = "coolOffPeriodInMillis"; public static final String CAN_UPDATE = "canUpdate"; - @SerializedName(value = SEARCHBP_SETTING_DIMENSION) - private String searchbpSettingDimension; + @SerializedName(value = THRESHOLD_NAME) + private String thresholdName; - @SerializedName(value = DESIRED_VALUE) - private double desiredValue; + @SerializedName(value = SEARCHBP_DIMENSION) + private String searchbpSettingDimension; - @SerializedName(value = CURRENT_VALUE) - private double currentValue; + @SerializedName(value = STEP_SIZE_IN_PERCENTAGE) + private double stepSizeInPercentage; @SerializedName(value = COOL_OFF_PERIOD) private long coolOffPeriodInMillis; @@ -198,32 +195,32 @@ public static class Summary { private boolean canUpdate; public Summary( + String thresholdName, String searchbpSettingDimension, - double desiredValue, - double currentValue, + double stepSizeInPercentage, long coolOffPeriodInMillis, boolean canUpdate) { + this.thresholdName = thresholdName; this.searchbpSettingDimension = searchbpSettingDimension; - this.desiredValue = desiredValue; - this.currentValue = currentValue; + this.stepSizeInPercentage = stepSizeInPercentage; this.coolOffPeriodInMillis = coolOffPeriodInMillis; this.canUpdate = canUpdate; } /* - * Dimension is the name of the setting to be modified + * ThresholdName is the name of the setting to be modified * e.g. 
node_duress.cpu_threshold, node_duress.search_heap_threshold */ - public String getSearchbpSettingDimension() { - return this.searchbpSettingDimension; + public String getThresholdName() { + return thresholdName; } - public double getCurrentValue() { - return this.currentValue; + public String getSearchbpSettingDimension() { + return searchbpSettingDimension; } - public double getDesiredValue() { - return this.desiredValue; + public double getStepSizeInPercentage() { + return stepSizeInPercentage; } public long getCoolOffPeriodInMillis() { diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index 65b8624aa..ea10fb2c6 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -18,7 +18,7 @@ * *

The config follows the format below "decider-config-settings": { * "search-back-pressure-policy-config": { "enabled": true, // whether the - * serch-back-pressure-policy should be enabled "hour-threshold": 30, // threshold for hourly + * serch-back-pressure-policy should be enabled "hour-breach-threshold": 30, // threshold for hourly * received unhealthy cluster level rca flow units, if above, then the below thresholds should be * modified "threshold_count": 2, // how many thresholds to be changed, in this case * search-heap-threshold and search-task-heap-threshold "search_task_heap_stepsize_in_percentage": diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 5d52f034d..bb7a2523b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -23,6 +23,7 @@ import org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; import org.opensearch.performanceanalyzer.grpc.Resource; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; @@ -39,16 +40,18 @@ public class SearchBackPressurePolicy implements DecisionPolicy { private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicy.class); - // TO DO - // 
Decide the Cooloff Period for the action - private static final long DEAFULT_COOLOFF_PERIOD_IN_MILLIS = 60L * 60L * 1000L; + // Default COOLOFF Period for the action (1 DAY) + private static final long DEAFULT_COOLOFF_PERIOD_IN_MILLIS = 24L * 60L * 60L * 1000L; + private static final String HEAP_THRESHOLD_STR = "heap_usage"; + private static final String SHARD_DIMENSION_STR = "SHARD"; + private static final String TASK_DIMENSION_STR = "TASK"; + private static final double DEFAULT_HEAP_CHANGE_IN_PERCENTAGE = 5.0; private static final Path SEARCHBP_DATA_FILE_PATH = Paths.get(RcaConsts.CONFIG_DIR_PATH, "SearchBackPressurePolicy_heap"); /* Specify a path to store SearchBackpressurePolicy_Autotune Stats */ - /* TO DO: Check which settings should be modifed based on search heap shard/task cancellation stats */ private AppContext appContext; private RcaConf rcaConf; private SearchBackPressurePolicyConfig policyConfig; @@ -58,23 +61,36 @@ public class SearchBackPressurePolicy implements DecisionPolicy { private int hourlyAlarmThreshold; /* Alarm for heap usage */ - static final List HEAP_SEARCHBP_SIGNALS = - Lists.newArrayList(SEARCHBACKPRESSURE_SHARD, SEARCHBACKPRESSURE_TASK); + static final List HEAP_SEARCHBP_SHARD_SIGNALS = + Lists.newArrayList(SEARCHBACKPRESSURE_SHARD); + static final List HEAP_SEARCHBP_TASK_SIGNALS = + Lists.newArrayList(SEARCHBACKPRESSURE_TASK); - /* alarm monitor per threshold per increase/decrease */ - @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapIncreaseAlarm; - @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureHeapDecreaseAlarm; + /* alarm monitors per threshold */ + // shard-level alarms + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureShardHeapIncreaseAlarm; + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureShardHeapDecreaseAlarm; + + // task-level alarms + @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureTaskHeapIncreaseAlarm; + @VisibleForTesting 
SearchBpActionsAlarmMonitor searchBackPressureTaskHeapDecreaseAlarm; public SearchBackPressurePolicy( SearchBackPressureClusterRCA searchBackPressureClusterRCA, - SearchBpActionsAlarmMonitor searchBackPressureHeapIncreaseAlarm) { + SearchBpActionsAlarmMonitor searchBackPressureShardHeapIncreaseAlarm, + SearchBpActionsAlarmMonitor searchBackPressureShardHeapDecreaseAlarm, + SearchBpActionsAlarmMonitor searchBackPressureTaskHeapIncreaseAlarm, + SearchBpActionsAlarmMonitor searchBackPressureTaskHeapDecreaseAlarm) { this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; - this.searchBackPressureHeapIncreaseAlarm = searchBackPressureHeapIncreaseAlarm; - LOG.info("SearchBackPressurePolicy#SearchBackPressurePolicy() initialize"); + this.searchBackPressureShardHeapIncreaseAlarm = searchBackPressureShardHeapIncreaseAlarm; + this.searchBackPressureShardHeapDecreaseAlarm = searchBackPressureShardHeapDecreaseAlarm; + this.searchBackPressureTaskHeapIncreaseAlarm = searchBackPressureTaskHeapIncreaseAlarm; + this.searchBackPressureTaskHeapDecreaseAlarm = searchBackPressureTaskHeapDecreaseAlarm; + LOG.info("SearchBackPressurePolicy#SearchBackPressurePolicy() initialized"); } public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureClusterRCA) { - this(searchBackPressureClusterRCA, null); + this(searchBackPressureClusterRCA, null, null, null, null); } /** @@ -84,10 +100,34 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC */ private void record(HotResourceSummary issue) { LOG.info("SearchBackPressurePolicy#record()"); - if (HEAP_SEARCHBP_SIGNALS.contains(issue.getResource())) { - LOG.info( - "Recording issue in searchBackPressureHeapIncreaseAlarm since Resource Searchbp Shard or Task appears"); - searchBackPressureHeapIncreaseAlarm.recordIssue(); + if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(issue.getResource())) { + LOG.info("Recording shard-level issue"); + // increase alarm for heap-related threshold (shard-level) + 
if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording increase-level issue for shard"); + searchBackPressureShardHeapIncreaseAlarm.recordIssue(); + } + + // decrease alarm for heap-related threshold (shard-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording decrease-level issue for shard"); + searchBackPressureShardHeapDecreaseAlarm.recordIssue(); + } + + } else if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { + LOG.info("Recording Task-Level issue"); + + // increase alarm for heap-related threshold (task-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording increase-level issue for task"); + searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); + } + + // decrease alarm for heap-related threshold (task-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording decrease-level issue for task"); + searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); + } } } @@ -115,20 +155,25 @@ private void recordIssues() { "SearchBackPressurePolicy#recordIssues() Summary test_counter: " + test_counter); record(summary); - // TO DO: Check if we need to increase or decrease the heap threshold } } } } - /* TO DO: Change the logic of heapThresholdIsTooSmall */ - public boolean heapThresholdIsTooSmall() { - return !searchBackPressureHeapIncreaseAlarm.isHealthy(); + public boolean shardHeapThresholdIsTooSmall() { + return !searchBackPressureShardHeapIncreaseAlarm.isHealthy(); } - /* TO DO: Change the logic of heapThresholdIsTooLarge */ - public boolean heapThresholdIsTooLarge() { - return !searchBackPressureHeapIncreaseAlarm.isHealthy(); + public boolean shardHeapThresholdIsTooLarge() { + return !searchBackPressureShardHeapDecreaseAlarm.isHealthy(); + } + + public boolean taskHeapThresholdIsTooSmall() { + return 
!searchBackPressureTaskHeapIncreaseAlarm.isHealthy(); + } + + public boolean taskHeapThresholdIsTooLarge() { + return !searchBackPressureTaskHeapDecreaseAlarm.isHealthy(); } // create alarm monitor from config @@ -155,13 +200,26 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) // initalize all alarm monitors public void initialize() { LOG.info("Initializing alarms with dummy path"); - if (searchBackPressureHeapIncreaseAlarm == null) { - searchBackPressureHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + if (searchBackPressureShardHeapIncreaseAlarm == null) { + searchBackPressureShardHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + } + + if (searchBackPressureShardHeapDecreaseAlarm == null) { + searchBackPressureShardHeapDecreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + } + + if (searchBackPressureTaskHeapIncreaseAlarm == null) { + searchBackPressureTaskHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + } + + if (searchBackPressureTaskHeapDecreaseAlarm == null) { + searchBackPressureTaskHeapDecreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } } @Override public List evaluate() { + LOG.info("Evaluate() of SearchBackpressurePolicy."); List actions = new ArrayList<>(); if (rcaConf == null || appContext == null) { LOG.error("rca conf/app context is null, return empty action list"); @@ -173,53 +231,60 @@ public List evaluate() { LOG.info("SearchBackPressurePolicy is disabled"); return actions; } - LOG.info("Evaluate() of SearchBackpressurePolicy."); initialize(); LOG.info( - "searchBackPressureHeapIncreaseAlarm#hour breach threshold is {}", - searchBackPressureHeapIncreaseAlarm.getHourBreachThreshold()); + "searchBackPressureShardHeapIncreaseAlarm#hour breach threshold is {}", + searchBackPressureShardHeapIncreaseAlarm.getHourBreachThreshold()); recordIssues(); - if (heapThresholdIsTooSmall()) { - LOG.info( - "SearchBackPressurePolicy#evaluate() heap usage need to 
be autotuned. raise heap suage threshold action Added!"); + if (shardHeapThresholdIsTooSmall()) { + LOG.info("shardHeapThresholdIsTooSmall action Added!"); // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, - "heap_usage", - 75.0, - 70)); - } - - if (heapThresholdIsTooLarge()) { - LOG.info( - "SearchBackPressurePolicy#evaluate() heap usage need to be autotuned. drop heap suage threshold action Added!"); + HEAP_THRESHOLD_STR, + SHARD_DIMENSION_STR, + DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + } else if (shardHeapThresholdIsTooLarge()) { + LOG.info("shardHeapThresholdIsTooLarge action Added!"); // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, - "heap_usage", - 65.0, - 70)); + HEAP_THRESHOLD_STR, + SHARD_DIMENSION_STR, + DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + } else if (taskHeapThresholdIsTooSmall()) { + LOG.info("taskHeapThresholdIsTooSmall action Added!"); + // suggest the downstream cls to modify heap usgae threshold + actions.add( + new SearchBackPressureAction( + appContext, + true, + DEAFULT_COOLOFF_PERIOD_IN_MILLIS, + HEAP_THRESHOLD_STR, + TASK_DIMENSION_STR, + DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + } else if (taskHeapThresholdIsTooLarge()) { + LOG.info("taskHeapThresholdIsTooLarge action Added!"); + // suggest the downstream cls to modify heap usgae threshold + actions.add( + new SearchBackPressureAction( + appContext, + true, + DEAFULT_COOLOFF_PERIOD_IN_MILLIS, + HEAP_THRESHOLD_STR, + TASK_DIMENSION_STR, + DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); } - // else if (youngGenerationIsTooSmall()) { - // LOG.debug("The young generation is too small!"); - // int newRatio = computeIncrease(getCurrentRatio()); - // if (newRatio >= 1) { - // LOG.debug("Adding new JvmGenAction with ratio {}", newRatio); - // actions.add(new JvmGenAction(appContext, newRatio, 
COOLOFF_PERIOD_IN_MILLIS, - // true)); - // } - // } - return actions; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index e53d689f8..1b7939318 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -24,7 +24,7 @@ public class SearchBpActionsAlarmMonitor implements AlarmMonitor { * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal * TODO: Remove 2 for testing, replace with 30 */ - private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; + private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 3; private static final int DEFAULT_BUCKET_WINDOW_SIZE = 1; // private static final int DEFAULT_DAY_BREACH_THRESHOLD = 1; // private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 1; From 1d9e821cdecb627cabf8461c8a404f367cfebe69 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 14 Jul 2023 15:17:29 -0700 Subject: [PATCH 46/73] Generic Framework can generate specific actions and read from config (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 8 ++------ .../SearchBackPressurePolicyConfig.java | 17 +++++++++++++++++ .../SearchBackPressurePolicy.java | 14 +++++++------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 7d6275726..a21195919 100644 --- 
a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -24,9 +24,8 @@ public class SearchBackPressureAction extends SuppressibleAction { public static final String NAME = "SearchBackPressureAction"; private static final ImpactVector NO_IMPACT = new ImpactVector(); - /* TO DO: Discuss the default cool off period for SearchBackPressureAction + /* * Time to wait since last recommendation, before suggesting this action again - * Needs the action config to have the cool off period for all dimension */ private static final long DEFAULT_COOL_OFF_PERIOD_IN_MILLIS = TimeUnit.DAYS.toMillis(1); @@ -89,9 +88,6 @@ public List impactedNodes() { public Map impact() { Map impact = new HashMap<>(); for (NodeKey key : impactedNodes()) { - // Since SearchBackPressure Service only modify the threshold rather than general - // resources like CPU/Heap - // So there is no impact on the dimensions impact.put(key, NO_IMPACT); } return impact; @@ -166,7 +162,7 @@ public SearchBackPressureAction build() { /* Write Static Class Summary to conver the Searchbp Action POJO to JSON Object * Key fields to be included - * 1. ThresholdName of the Searchbp setting to be modified + * 1. ThresholdName: name of the SearchBackPressure threshold to be tuned * 2. Dimension of the action (Shard/Task) * 3. StepSizeInPercentage to change the threshold * 4. 
CoolOffPeriodInMillis for the action diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index ea10fb2c6..0745f156a 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -40,6 +40,8 @@ public class SearchBackPressurePolicyConfig { "hour-monitor-window-size-minutes"; private static final String HOUR_MONITOR_BUCKET_SIZE_MINUTES = "hour-monitor-bucket-size-minutes"; + private static final String SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE = + "searchbp-heap-stepsize-in-percentage"; // Default values public static final boolean DEFAULT_ENABLED = true; @@ -48,11 +50,13 @@ public class SearchBackPressurePolicyConfig { public static final int DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES = (int) TimeUnit.HOURS.toMinutes(1); public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 5; + public static final double DEFAULT_SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE = 5; private Config hourBreachThreshold; private Config enabled; private Config hourMonitorWindowSizeMinutes; private Config hourMonitorBucketSizeMinutes; + private Config searchbpHeapStepsizeInPercentage; public SearchBackPressurePolicyConfig(NestedConfig config) { enabled = new Config<>(ENABLED, config.getValue(), DEFAULT_ENABLED, Boolean.class); @@ -78,6 +82,15 @@ public SearchBackPressurePolicyConfig(NestedConfig config) { config.getValue(), DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES, Integer.class); + searchbpHeapStepsizeInPercentage = + new Config<>( + SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE, + config.getValue(), + DEFAULT_SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE, 
+ Double.class); + LOG.info( + "searchbpHeapStepsizeInPercentage is {}", + searchbpHeapStepsizeInPercentage.getValue()); } /** @@ -100,4 +113,8 @@ public int getHourMonitorWindowSizeMinutes() { public int getHourMonitorBucketSizeMinutes() { return hourMonitorBucketSizeMinutes.getValue(); } + + public double getSearchbpHeapStepsizeInPercentage() { + return searchbpHeapStepsizeInPercentage.getValue(); + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index bb7a2523b..304b2f5de 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -181,10 +181,11 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) // LOG the policyConfig.getHourMonitorWindowSizeMinutes() BuketSize and dahy breanch // threhsold LOG.info( - "createAlarmMonitor with hour window: {}, bucket size: {}, hour threshold: {}", + "createAlarmMonitor with hour window: {}, bucket size: {}, hour threshold: {}, stepsize: {}", policyConfig.getHourMonitorWindowSizeMinutes(), policyConfig.getHourMonitorBucketSizeMinutes(), - policyConfig.getHourBreachThreshold()); + policyConfig.getHourBreachThreshold(), + policyConfig.getSearchbpHeapStepsizeInPercentage()); BucketizedSlidingWindowConfig hourMonitorConfig = new BucketizedSlidingWindowConfig( policyConfig.getHourMonitorWindowSizeMinutes(), @@ -193,7 +194,6 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) persistenceBasePath); // TODO: Check whether we need a persistence path to write our data - // return new SearchBpActionsAlarmMonitor(policyConfig.getHourBreachThreshold()); } @@ -249,7 
+249,7 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, SHARD_DIMENSION_STR, - DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (shardHeapThresholdIsTooLarge()) { LOG.info("shardHeapThresholdIsTooLarge action Added!"); // suggest the downstream cls to modify heap usgae threshold @@ -260,7 +260,7 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, SHARD_DIMENSION_STR, - DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooSmall()) { LOG.info("taskHeapThresholdIsTooSmall action Added!"); // suggest the downstream cls to modify heap usgae threshold @@ -271,7 +271,7 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, TASK_DIMENSION_STR, - DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooLarge()) { LOG.info("taskHeapThresholdIsTooLarge action Added!"); // suggest the downstream cls to modify heap usgae threshold @@ -282,7 +282,7 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, TASK_DIMENSION_STR, - DEFAULT_HEAP_CHANGE_IN_PERCENTAGE)); + policyConfig.getSearchbpHeapStepsizeInPercentage())); } return actions; From d6558ede437c328cbd0d9210e94e23f73fc0f9ba Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 14 Jul 2023 15:44:36 -0700 Subject: [PATCH 47/73] removed dead comment for SearchBpActionConfig.java (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBpActionsAlarmMonitor.java | 59 ++----------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java 
b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index 1b7939318..ce13ccbf0 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -20,29 +20,19 @@ public class SearchBpActionsAlarmMonitor implements AlarmMonitor { private static final Logger LOG = LogManager.getLogger(SearchBpActionsAlarmMonitor.class); - /* Current design only uses hour monitor to evaluate the health of the searchbackpressure service - * if there are more than 30 bad units in one hour, then the alarm shows a Unhealthy Signal - * TODO: Remove 2 for testing, replace with 30 + /* Current design uses hour monitor to evaluate the health of the searchbackpressure service + * if there are more than 30 bad resournce units in one hour, then the alarm shows a Unhealthy Signal */ + + // TODO: Remove 3 for testing, replace with 30 private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 3; private static final int DEFAULT_BUCKET_WINDOW_SIZE = 1; - // private static final int DEFAULT_DAY_BREACH_THRESHOLD = 1; - // private static final int DEFAULT_WEEK_BREACH_THRESHOLD = 1; private static final String HOUR_PREFIX = "hour-"; - // private static final String DAY_PREFIX = "day-"; - // private static final String WEEK_PREFIX = "week-"; public static final int HOUR_MONITOR_BUCKET_WINDOW_MINUTES = 5; - // public static final int DAY_MONITOR_BUCKET_WINDOW_MINUTES = 30; - // public static final int WEEK_MONITOR_BUCKET_WINDOW_MINUTES = 86400; private BucketizedSlidingWindow hourMonitor; - // private BucketizedSlidingWindow dayMonitor; - // private BucketizedSlidingWindow weekMonitor; - private int hourBreachThreshold; - // private int dayBreachThreshold; - // private int weekBreachThreshold; private boolean alarmHealthy = true; @@ 
-65,13 +55,9 @@ public SearchBpActionsAlarmMonitor( Paths.get( persistenceBase.toString(), HOUR_PREFIX + persistenceFile.toString()); - // weekMonitorPath = - // Paths.get( - // persistenceBase.toString(), - // WEEK_PREFIX + persistenceFile.toString()); } } - // initialize hour Monitor + // initialize hourly alarm monitor if (hourMonitorConfig == null) { /* * Bucket Window Size means the number of issues can exist in a bucket @@ -87,15 +73,8 @@ public SearchBpActionsAlarmMonitor( } else { hourMonitor = new BucketizedSlidingWindow(hourMonitorConfig); } - // // initialize weekMonitor - // if (weekMonitorConfig == null) { - // weekMonitor = new BucketizedSlidingWindow(4, 1, TimeUnit.DAYS, weekMonitorPath); - // } else { - // weekMonitor = new BucketizedSlidingWindow(weekMonitorConfig); - // } this.hourBreachThreshold = hourBreachThreshold; - // this.weekBreachThreshold = weekBreachThreshold; } public SearchBpActionsAlarmMonitor(int hourBreachThreshold, @Nullable Path persistencePath) { @@ -119,17 +98,10 @@ public void recordIssue(long timeStamp, double value) { SlidingWindowData dataPoint = new SlidingWindowData(timeStamp, value); LOG.info("Search Backpressure Actions Alarm is recording a new issue at {}", timeStamp); hourMonitor.next(dataPoint); - - // // If we've breached the hour threshold, record it as a bad day/ - // if (hourMonitor.size() >= hourBreachThreshold) { - // dayMonitor.next(new SlidingWindowData(dataPoint.getTimeStamp(), - // dataPoint.getValue())); - // } } private void evaluateAlarm() { if (alarmHealthy) { - LOG.info("Alarm healthy with hourMonitor.size() = {}", hourMonitor.size()); if (hourMonitor.size() >= hourBreachThreshold) { LOG.info( "Search Backpressure Actions Alarm is Unhealthy because hourMonitor.size() is {}, and threshold is {}", @@ -138,9 +110,8 @@ private void evaluateAlarm() { alarmHealthy = false; } } else { - LOG.info("Alarm not healthy"); if (hourMonitor.size() == 0) { - LOG.info("SearchBackpressure Hour Monitor is healthy for 
zero capacity"); + LOG.info("SearchBackpressure Hour Monitor is now healthy for zero capacity"); alarmHealthy = true; } } @@ -150,29 +121,11 @@ public int getHourBreachThreshold() { return hourBreachThreshold; } - // public int getDayBreachThreshold() { - // return dayBreachThreshold; - // } - - // public int getWeekBreachThreshold() { - // return weekBreachThreshold; - // } - @VisibleForTesting BucketizedSlidingWindow getHourMonitor() { return hourMonitor; } - // @VisibleForTesting - // BucketizedSlidingWindow getDayMonitor() { - // return dayMonitor; - // } - - // @VisibleForTesting - // BucketizedSlidingWindow getWeekMonitor() { - // return weekMonitor; - // } - @VisibleForTesting void setAlarmHealth(boolean isHealthy) { this.alarmHealthy = isHealthy; From e3bef947a5ade876e3b32534e457383714f56119 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 14 Jul 2023 15:52:30 -0700 Subject: [PATCH 48/73] removed dead comment for action/polict (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../decisionmaker/actions/SearchBackPressureAction.java | 2 +- .../deciders/searchbackpressure/SearchBackPressurePolicy.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index a21195919..4e0f19b32 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -33,7 +33,7 @@ public class SearchBackPressureAction extends SuppressibleAction { * canUpdate: whether the action should be emitted * coolOffPeriodInMillis: how long the CoolOffPeriod the action should before reemit * thresholdname: the name of threshold we are tuning (e.g. 
node_duress.cpu_threshold, search_heap_threshold) - * dimension: shard/task (String) + * dimension: indicates whether the resource unit is caused by shard/task level searchbackpressure cancellation stats * Step Size in percentage: how much should the threshold change in percentage */ private boolean canUpdate; diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 304b2f5de..7ec66545c 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -200,18 +200,22 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) // initalize all alarm monitors public void initialize() { LOG.info("Initializing alarms with dummy path"); + // initialize shard level alarm for resounce unit that suggests to increase jvm threshold if (searchBackPressureShardHeapIncreaseAlarm == null) { searchBackPressureShardHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } + // initialize shard level alarm for resounce unit that suggests to decrease jvm threshold if (searchBackPressureShardHeapDecreaseAlarm == null) { searchBackPressureShardHeapDecreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } + // initialize task level alarm for resounce unit that suggests to increase jvm threshold if (searchBackPressureTaskHeapIncreaseAlarm == null) { searchBackPressureTaskHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } + // initialize task level alarm for resounce unit that suggests to decrease jvm threhsold if (searchBackPressureTaskHeapDecreaseAlarm == null) { searchBackPressureTaskHeapDecreaseAlarm = 
createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); } From b0c7f02616ea00187c8f538cad2fea5d6a2ff9bb Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 14 Jul 2023 16:03:33 -0700 Subject: [PATCH 49/73] removed dead comment for action/policy (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 2 +- .../searchbackpressure/SearchBackPressurePolicy.java | 11 ++--------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 4e0f19b32..cb0abcfee 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -32,7 +32,7 @@ public class SearchBackPressureAction extends SuppressibleAction { /* From Config Per Diumension Type * canUpdate: whether the action should be emitted * coolOffPeriodInMillis: how long the CoolOffPeriod the action should before reemit - * thresholdname: the name of threshold we are tuning (e.g. node_duress.cpu_threshold, search_heap_threshold) + * thresholdName: the name of threshold we are tuning (e.g. 
node_duress.cpu_threshold, search_heap_threshold) * dimension: indicates whether the resource unit is caused by shard/task level searchbackpressure cancellation stats * Step Size in percentage: how much should the threshold change in percentage */ diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 7ec66545c..01370fa6f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -50,16 +50,13 @@ public class SearchBackPressurePolicy implements DecisionPolicy { private static final Path SEARCHBP_DATA_FILE_PATH = Paths.get(RcaConsts.CONFIG_DIR_PATH, "SearchBackPressurePolicy_heap"); - /* Specify a path to store SearchBackpressurePolicy_Autotune Stats */ + /* TO DO: Specify a path to store SearchBackpressurePolicy_Autotune Stats */ private AppContext appContext; private RcaConf rcaConf; private SearchBackPressurePolicyConfig policyConfig; private SearchBackPressureClusterRCA searchBackPressureClusterRCA; - /* Hourly Alarm frequency threshold */ - private int hourlyAlarmThreshold; - /* Alarm for heap usage */ static final List HEAP_SEARCHBP_SHARD_SIGNALS = Lists.newArrayList(SEARCHBACKPRESSURE_SHARD); @@ -223,7 +220,7 @@ public void initialize() { @Override public List evaluate() { - LOG.info("Evaluate() of SearchBackpressurePolicy."); + LOG.info("Evaluate() of SearchBackpressurePolicy started"); List actions = new ArrayList<>(); if (rcaConf == null || appContext == null) { LOG.error("rca conf/app context is null, return empty action list"); @@ -245,7 +242,6 @@ public List evaluate() { if (shardHeapThresholdIsTooSmall()) { LOG.info("shardHeapThresholdIsTooSmall 
action Added!"); - // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, @@ -256,7 +252,6 @@ public List evaluate() { policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (shardHeapThresholdIsTooLarge()) { LOG.info("shardHeapThresholdIsTooLarge action Added!"); - // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, @@ -267,7 +262,6 @@ public List evaluate() { policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooSmall()) { LOG.info("taskHeapThresholdIsTooSmall action Added!"); - // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, @@ -278,7 +272,6 @@ public List evaluate() { policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooLarge()) { LOG.info("taskHeapThresholdIsTooLarge action Added!"); - // suggest the downstream cls to modify heap usgae threshold actions.add( new SearchBackPressureAction( appContext, From b6398fdc4d5878c4f521748c54a1944536a655e7 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Mon, 17 Jul 2023 15:08:58 -0700 Subject: [PATCH 50/73] Add increase/decrease direction for ActionPojo (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 45 +++++++++++++++++++ .../SearchBackPressureDecider.java | 14 ++++++ .../SearchBackPressurePolicy.java | 8 ++++ 3 files changed, 67 insertions(+) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index cb0abcfee..62b53588b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ 
b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -40,6 +40,7 @@ public class SearchBackPressureAction extends SuppressibleAction { private long coolOffPeriodInMillis; private String thresholdName; private String dimension; + private String direction; private double stepSizeInPercentage; public SearchBackPressureAction( @@ -48,12 +49,14 @@ public SearchBackPressureAction( final long coolOffPeriodInMillis, final String thresholdName, final String dimension, + final String direction, final double stepSizeInPercentage) { super(appContext); this.canUpdate = canUpdate; this.coolOffPeriodInMillis = coolOffPeriodInMillis; this.thresholdName = thresholdName; this.dimension = dimension; + this.direction = direction; this.stepSizeInPercentage = stepSizeInPercentage; } @@ -101,6 +104,10 @@ public String getDimension() { return dimension; } + public String getDirection() { + return direction; + } + public double getStepSizeInPercentage() { return stepSizeInPercentage; } @@ -111,6 +118,7 @@ public String summary() { new Summary( thresholdName, dimension, + direction, stepSizeInPercentage, DEFAULT_COOL_OFF_PERIOD_IN_MILLIS, canUpdate); @@ -123,6 +131,7 @@ public static final class Builder { private final AppContext appContext; private final String thresholdName; private final String dimension; + private final String direction; private boolean canUpdate; private double stepSizeInPercentage; private long coolOffPeriodInMillis; @@ -131,10 +140,12 @@ private Builder( final AppContext appContext, final String thresholdName, final String dimension, + final String direction, final long coolOffPeriodInMillis) { this.appContext = appContext; this.thresholdName = thresholdName; this.dimension = dimension; + this.direction = direction; this.coolOffPeriodInMillis = coolOffPeriodInMillis; this.canUpdate = DEFAULT_CAN_UPDATE; } @@ -156,6 +167,7 @@ public SearchBackPressureAction build() { coolOffPeriodInMillis, thresholdName, dimension, 
+ direction, stepSizeInPercentage); } } @@ -164,6 +176,7 @@ public SearchBackPressureAction build() { * Key fields to be included * 1. ThresholdName: name of the SearchBackPressure threshold to be tuned * 2. Dimension of the action (Shard/Task) + * 3. Direction of the action (Increase/Decrease) * 3. StepSizeInPercentage to change the threshold * 4. CoolOffPeriodInMillis for the action * 5. canUpdate (whether the action should be emitted) @@ -171,6 +184,7 @@ public SearchBackPressureAction build() { public static class Summary { public static final String THRESHOLD_NAME = "thresholdName"; public static final String SEARCHBP_DIMENSION = "searchbpDimension"; + public static final String DIRECTION = "direction"; public static final String STEP_SIZE_IN_PERCENTAGE = "stepSizeInPercentage"; public static final String COOL_OFF_PERIOD = "coolOffPeriodInMillis"; public static final String CAN_UPDATE = "canUpdate"; @@ -181,6 +195,9 @@ public static class Summary { @SerializedName(value = SEARCHBP_DIMENSION) private String searchbpSettingDimension; + @SerializedName(value = DIRECTION) + private String direction; + @SerializedName(value = STEP_SIZE_IN_PERCENTAGE) private double stepSizeInPercentage; @@ -193,11 +210,13 @@ public static class Summary { public Summary( String thresholdName, String searchbpSettingDimension, + String direction, double stepSizeInPercentage, long coolOffPeriodInMillis, boolean canUpdate) { this.thresholdName = thresholdName; this.searchbpSettingDimension = searchbpSettingDimension; + this.direction = direction; this.stepSizeInPercentage = stepSizeInPercentage; this.coolOffPeriodInMillis = coolOffPeriodInMillis; this.canUpdate = canUpdate; @@ -215,6 +234,10 @@ public String getSearchbpSettingDimension() { return searchbpSettingDimension; } + public String getDirection() { + return direction; + } + public double getStepSizeInPercentage() { return stepSizeInPercentage; } @@ -232,4 +255,26 @@ public String toJson() { return gson.toJson(this); } } + + // 
enum to indicate to increase/decrease the threshold + public enum SearchbpThresholdActionDirection { + INCREASE(SearchbpThresholdActionDirection.Constants.INCREASE_STR), + DECREASE(SearchbpThresholdActionDirection.Constants.DECREASE_STR); + + private final String value; + + SearchbpThresholdActionDirection(String value) { + this.value = value; + } + + @Override + public String toString() { + return value; + } + + public static class Constants { + public static final String INCREASE_STR = "increase"; + public static final String DECREASE_STR = "decrease"; + } + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index a053dc845..ebe7b6986 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger; import org.opensearch.performanceanalyzer.AppContext; import org.opensearch.performanceanalyzer.decisionmaker.actions.Action; +import org.opensearch.performanceanalyzer.decisionmaker.actions.SearchBackPressureAction; import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decider; import org.opensearch.performanceanalyzer.decisionmaker.deciders.Decision; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; @@ -58,6 +59,19 @@ public Decision operate() { // SearchBackPressure Policy is always accepted List searchBackPressureActions = searchBackPressurePolicy.evaluate(); + // loop through the actions and print the action threshold name, dimension, + // increase/decrease + for (int i = 0; i < searchBackPressureActions.size(); i++) { + LOG.info( + "Action details, threshold name: {}, 
dimension: {}, increase/decrease: {}, stepsize: {}", + ((SearchBackPressureAction) searchBackPressureActions.get(i)) + .getThresholdName(), + ((SearchBackPressureAction) searchBackPressureActions.get(i)).getDimension(), + ((SearchBackPressureAction) searchBackPressureActions.get(i)).getDirection(), + ((SearchBackPressureAction) searchBackPressureActions.get(i)) + .getStepSizeInPercentage()); + } + searchBackPressureActions.forEach(decision::addAction); LOG.info("decision action size is {}", decision.getActions().size()); diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 01370fa6f..f68729421 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -249,6 +249,8 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, SHARD_DIMENSION_STR, + SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE + .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (shardHeapThresholdIsTooLarge()) { LOG.info("shardHeapThresholdIsTooLarge action Added!"); @@ -259,6 +261,8 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, SHARD_DIMENSION_STR, + SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE + .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooSmall()) { LOG.info("taskHeapThresholdIsTooSmall action Added!"); @@ -269,6 +273,8 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, TASK_DIMENSION_STR, + SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE + .toString(), 
policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (taskHeapThresholdIsTooLarge()) { LOG.info("taskHeapThresholdIsTooLarge action Added!"); @@ -279,6 +285,8 @@ public List evaluate() { DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, TASK_DIMENSION_STR, + SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE + .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); } From 2d405166124182468ea19d8057b89edbfe15984b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 09:54:49 -0700 Subject: [PATCH 51/73] remove trailing (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/build.gradle b/build.gradle index 25b70ea22..0d4fbd466 100644 --- a/build.gradle +++ b/build.gradle @@ -373,6 +373,7 @@ dependencies { strictly "2.23.0" } } + testImplementation group: 'org.powermock', name: 'powermock-core', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-api-support', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-module-junit4-common', version: '2.0.0' From bbcdfa1069fb43901a3628452d5bb01b462dd5e7 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 14:12:20 -0700 Subject: [PATCH 52/73] Restore to workable solution Signed-off-by: CoderJeffrey --- .../opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index 147dd5db1..26624bc11 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -271,7 +271,6 @@ public MinMaxSlidingWindow( @Override public void next(SlidingWindowData e) { - boolean pollFirstCondition; if (isMinSlidingWindow) { // 
monotonically decreasing sliding window while (!windowDeque.isEmpty() From 11e2241bf5d66cd12351a18e21899cc7c1ea88b0 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 14:36:29 -0700 Subject: [PATCH 53/73] Restore to workable solution (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../decisionmaker/actions/SearchBackPressureAction.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 62b53588b..ddd136e17 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -258,8 +258,8 @@ public String toJson() { // enum to indicate to increase/decrease the threshold public enum SearchbpThresholdActionDirection { - INCREASE(SearchbpThresholdActionDirection.Constants.INCREASE_STR), - DECREASE(SearchbpThresholdActionDirection.Constants.DECREASE_STR); + INCREASE(SearchbpThresholdActionDirection.Constants.INCREASE), + DECREASE(SearchbpThresholdActionDirection.Constants.DECREASE); private final String value; @@ -273,8 +273,8 @@ public String toString() { } public static class Constants { - public static final String INCREASE_STR = "increase"; - public static final String DECREASE_STR = "decrease"; + public static final String INCREASE = "increase"; + public static final String DECREASE = "decrease"; } } } From 2249395cffe496a6d8269509ea7545e13698bd92 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 14:56:52 -0700 Subject: [PATCH 54/73] remove hourly window size and bucket size (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicyConfig.java | 26 +++---------------- 1 
file changed, 4 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index 0745f156a..64e732c63 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -36,10 +36,6 @@ public class SearchBackPressurePolicyConfig { private static final String ENABLED = "enabled"; private static final String HOUR_BREACH_THRESHOLD = "hour-breach-threshold"; private static final String THRESHOLD_COUNT = "threshold_count"; - private static final String HOUR_MONITOR_WINDOW_SIZE_MINUTES = - "hour-monitor-window-size-minutes"; - private static final String HOUR_MONITOR_BUCKET_SIZE_MINUTES = - "hour-monitor-bucket-size-minutes"; private static final String SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE = "searchbp-heap-stepsize-in-percentage"; @@ -49,13 +45,11 @@ public class SearchBackPressurePolicyConfig { public static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; public static final int DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES = (int) TimeUnit.HOURS.toMinutes(1); - public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 5; + public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 1; public static final double DEFAULT_SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE = 5; private Config hourBreachThreshold; private Config enabled; - private Config hourMonitorWindowSizeMinutes; - private Config hourMonitorBucketSizeMinutes; private Config searchbpHeapStepsizeInPercentage; public SearchBackPressurePolicyConfig(NestedConfig config) { @@ -69,19 +63,7 @@ public SearchBackPressurePolicyConfig(NestedConfig 
config) { LOG.info( "SearchBackPressurePolicyConfig hour breach threshold is: {}", hourBreachThreshold.getValue()); - hourMonitorWindowSizeMinutes = - new Config<>( - HOUR_MONITOR_WINDOW_SIZE_MINUTES, - config.getValue(), - DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES, - Integer.class); - LOG.info("hourMonitorWindowSizeMinutes is: {}", hourMonitorWindowSizeMinutes.getValue()); - hourMonitorBucketSizeMinutes = - new Config<>( - HOUR_MONITOR_BUCKET_SIZE_MINUTES, - config.getValue(), - DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES, - Integer.class); + searchbpHeapStepsizeInPercentage = new Config<>( SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE, @@ -107,11 +89,11 @@ public int getHourBreachThreshold() { } public int getHourMonitorWindowSizeMinutes() { - return hourMonitorWindowSizeMinutes.getValue(); + return DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES; } public int getHourMonitorBucketSizeMinutes() { - return hourMonitorBucketSizeMinutes.getValue(); + return DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES; } public double getSearchbpHeapStepsizeInPercentage() { From d05f0933ba338b107f678e5c5f03719d59322d0d Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 19 Jul 2023 15:17:46 -0700 Subject: [PATCH 55/73] Add hourMonitorConfig to set up alarm monitor (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../deciders/searchbackpressure/SearchBackPressurePolicy.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index f68729421..a647996cd 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -191,7 
+191,8 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) persistenceBasePath); // TODO: Check whether we need a persistence path to write our data - return new SearchBpActionsAlarmMonitor(policyConfig.getHourBreachThreshold()); + return new SearchBpActionsAlarmMonitor( + policyConfig.getHourBreachThreshold(), null, hourMonitorConfig); } // initalize all alarm monitors From f60a8d906891ad2032795a7d703a1bbd43374ad8 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 20 Jul 2023 10:47:03 -0700 Subject: [PATCH 56/73] Remove unused counter (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 33 ++++++++----------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index a647996cd..3efc9a12f 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -137,7 +137,7 @@ private void recordIssues() { "SearchBackPressurePolicy#recordIssues() No flow units in searchBackPressureClusterRCA"); return; } - int test_counter = 0; + for (ResourceFlowUnit flowUnit : searchBackPressureClusterRCA.getFlowUnits()) { if (!flowUnit.hasResourceSummary()) { @@ -147,29 +147,25 @@ private void recordIssues() { HotClusterSummary clusterSummary = flowUnit.getSummary(); for (HotNodeSummary nodeSummary : clusterSummary.getHotNodeSummaryList()) { for (HotResourceSummary summary : nodeSummary.getHotResourceSummaryList()) { - test_counter += 1; - LOG.info( - "SearchBackPressurePolicy#recordIssues() Summary test_counter: " - + test_counter); 
record(summary); } } } } - public boolean shardHeapThresholdIsTooSmall() { + public boolean isShardHeapThresholdTooSmall() { return !searchBackPressureShardHeapIncreaseAlarm.isHealthy(); } - public boolean shardHeapThresholdIsTooLarge() { + public boolean isShardHeapThresholdTooLarge() { return !searchBackPressureShardHeapDecreaseAlarm.isHealthy(); } - public boolean taskHeapThresholdIsTooSmall() { + public boolean isTaskHeapThresholdTooSmall() { return !searchBackPressureTaskHeapIncreaseAlarm.isHealthy(); } - public boolean taskHeapThresholdIsTooLarge() { + public boolean isTaskHeapThresholdTooLarge() { return !searchBackPressureTaskHeapDecreaseAlarm.isHealthy(); } @@ -235,14 +231,11 @@ public List evaluate() { } initialize(); - LOG.info( - "searchBackPressureShardHeapIncreaseAlarm#hour breach threshold is {}", - searchBackPressureShardHeapIncreaseAlarm.getHourBreachThreshold()); recordIssues(); - if (shardHeapThresholdIsTooSmall()) { - LOG.info("shardHeapThresholdIsTooSmall action Added!"); + if (isShardHeapThresholdTooSmall()) { + LOG.info("isShardHeapThresholdTooSmall action Added!"); actions.add( new SearchBackPressureAction( appContext, @@ -253,8 +246,8 @@ public List evaluate() { SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); - } else if (shardHeapThresholdIsTooLarge()) { - LOG.info("shardHeapThresholdIsTooLarge action Added!"); + } else if (isShardHeapThresholdTooLarge()) { + LOG.info("isShardHeapThresholdTooLarge action Added!"); actions.add( new SearchBackPressureAction( appContext, @@ -265,8 +258,8 @@ public List evaluate() { SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); - } else if (taskHeapThresholdIsTooSmall()) { - LOG.info("taskHeapThresholdIsTooSmall action Added!"); + } else if (isTaskHeapThresholdTooSmall()) { + LOG.info("isTaskHeapThresholdTooSmall action Added!"); 
actions.add( new SearchBackPressureAction( appContext, @@ -277,8 +270,8 @@ public List evaluate() { SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); - } else if (taskHeapThresholdIsTooLarge()) { - LOG.info("taskHeapThresholdIsTooLarge action Added!"); + } else if (isTaskHeapThresholdTooLarge()) { + LOG.info("isTaskHeapThresholdTooLarge action Added!"); actions.add( new SearchBackPressureAction( appContext, From c6551f00d5ac593fb6ab6983a462e5d7a658e705 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 20 Jul 2023 10:55:21 -0700 Subject: [PATCH 57/73] change field description (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicyConfig.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index 64e732c63..cceb94c0b 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -20,14 +20,10 @@ * "search-back-pressure-policy-config": { "enabled": true, // whether the * serch-back-pressure-policy should be enabled "hour-breach-threshold": 30, // threshold for hourly * received unhealthy cluster level rca flow units, if above, then the below thresholds should be - * modified "threshold_count": 2, // how many thresholds to be changed, in this case - * search-heap-threshold and search-task-heap-threshold "search_task_heap_stepsize_in_percentage": - * 5, "search_task_stepsize_in_percentage": 0.5" } } 
Explanation of thresholds that are being - * configured and modified based on current RCA flowunits: search_task_heap_stepsize_in_percentage: - * Defines the step size to change heap usage threshold (in percentage). for the sum of heap usages - * across all search tasks before in-flight cancellation is applied. - * search_task_stepsize_in_percentage: Defines the step size to change heap usage threshold (in - * percentage) for an individual task before it is considered for cancellation. + * modified, "threshold_count": 1, // how many thresholds to be changed, in this case + * search-heap-threshold, "searchbp-heap-stepsize-in-percentage": 5, } } + * "searchbp-heap-stepsize-in-percentage" defines the step size to change heap related threshold (in + * percentage). */ public class SearchBackPressurePolicyConfig { private static final Logger LOG = LogManager.getLogger(SearchBackPressurePolicyConfig.class); From ed42e7db680ad7ccffdb012228d6cdf481c8e163 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 20 Jul 2023 14:36:57 -0700 Subject: [PATCH 58/73] refactor (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 4 + .../SearchBackPressurePolicy.java | 76 ++++++++++++------- .../rca/store/rca/OldGenRca.java | 8 ++ 3 files changed, 62 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index ddd136e17..bc81c80a8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -38,9 +38,13 @@ public class SearchBackPressureAction extends SuppressibleAction { */ private boolean canUpdate; private long coolOffPeriodInMillis; + // TODO: change, dimension, 
direction as enums + // private String thresholdName; + private String dimension; private String direction; + private double stepSizeInPercentage; public SearchBackPressureAction( diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 3efc9a12f..0659dc721 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -50,7 +50,7 @@ public class SearchBackPressurePolicy implements DecisionPolicy { private static final Path SEARCHBP_DATA_FILE_PATH = Paths.get(RcaConsts.CONFIG_DIR_PATH, "SearchBackPressurePolicy_heap"); - /* TO DO: Specify a path to store SearchBackpressurePolicy_Autotune Stats */ + /* TODO: Specify a path to store SearchBackpressurePolicy_Autotune Stats */ private AppContext appContext; private RcaConf rcaConf; @@ -96,35 +96,44 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC * @param issue an issue with the application */ private void record(HotResourceSummary issue) { + // TODO: change nested if into menaing methods like recordSearchBpShardjIssue() + // recordSearchBpTaskIssue() + LOG.info("SearchBackPressurePolicy#record()"); if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(issue.getResource())) { LOG.info("Recording shard-level issue"); - // increase alarm for heap-related threshold (shard-level) - if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording increase-level issue for shard"); - searchBackPressureShardHeapIncreaseAlarm.recordIssue(); - } - - // decrease alarm for heap-related threshold (shard-level) - if (issue.getMetaData() == 
SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording decrease-level issue for shard"); - searchBackPressureShardHeapDecreaseAlarm.recordIssue(); - } - + recordSearchBackPressureShardIssue(issue); } else if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { LOG.info("Recording Task-Level issue"); + recordSearchBackPressureTaskIssue(issue); + } + } - // increase alarm for heap-related threshold (task-level) - if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording increase-level issue for task"); - searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); - } + private void recordSearchBackPressureShardIssue(HotResourceSummary issue) { + // increase alarm for heap-related threshold (shard-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording increase-level issue for shard"); + searchBackPressureShardHeapIncreaseAlarm.recordIssue(); + } - // decrease alarm for heap-related threshold (task-level) - if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording decrease-level issue for task"); - searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); - } + // decrease alarm for heap-related threshold (shard-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording decrease-level issue for shard"); + searchBackPressureShardHeapDecreaseAlarm.recordIssue(); + } + } + + private void recordSearchBackPressureTaskIssue(HotResourceSummary issue) { + // increase alarm for heap-related threshold (task-level) + if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording increase-level issue for task"); + searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); + } + + // decrease alarm for heap-related threshold (task-level) + if (issue.getMetaData() == 
SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { + LOG.info("recording decrease-level issue for task"); + searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); } } @@ -233,7 +242,20 @@ public List evaluate() { initialize(); recordIssues(); + // TODO: Search Task and Shard Alarm Should be seperated + // checkShardAlamr() and but first 2 ifs inside? + // checkTaskAlarms() and but last 2 ifs inside? + + checkShardAlarms(actions); + checkTaskAlarms(actions); + + // print current size of the actions + LOG.info("SearchBackPressurePolicy#evaluate() action size: {}", actions.size()); + + return actions; + } + private void checkShardAlarms(List actions) { if (isShardHeapThresholdTooSmall()) { LOG.info("isShardHeapThresholdTooSmall action Added!"); actions.add( @@ -258,7 +280,11 @@ public List evaluate() { SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); - } else if (isTaskHeapThresholdTooSmall()) { + } + } + + private void checkTaskAlarms(List actions) { + if (isTaskHeapThresholdTooSmall()) { LOG.info("isTaskHeapThresholdTooSmall action Added!"); actions.add( new SearchBackPressureAction( @@ -283,8 +309,6 @@ public List evaluate() { .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); } - - return actions; } public void setAppContext(AppContext appContext) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index 26624bc11..e6a9c5aa2 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -260,6 +260,7 @@ public double readMin() { */ public static class MinMaxSlidingWindow extends SlidingWindow { boolean isMinSlidingWindow; + // change the boolean to the Biconsumer public MinMaxSlidingWindow( int 
SLIDING_WINDOW_SIZE_IN_TIMESTAMP, @@ -267,10 +268,17 @@ public MinMaxSlidingWindow( boolean isMinSlidingWindow) { super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); this.isMinSlidingWindow = isMinSlidingWindow; + + // get the Biconsumer from the client } @Override public void next(SlidingWindowData e) { + // TODO: refactored into a lambda function and have that lambda as a member variable? + // () => {} + + // Biconsumer to add based on conditions + if (isMinSlidingWindow) { // monotonically decreasing sliding window while (!windowDeque.isEmpty() From 3709bf0abb3a00ad9130459b565faad1b70c420b Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 20 Jul 2023 16:05:45 -0700 Subject: [PATCH 59/73] enum added for direction and shard/task dimension (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 29 ++++++++++++++++--- .../SearchBackPressurePolicy.java | 8 ++--- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index bc81c80a8..9f7ccd2f7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -38,13 +38,11 @@ public class SearchBackPressureAction extends SuppressibleAction { */ private boolean canUpdate; private long coolOffPeriodInMillis; - // TODO: change, dimension, direction as enums - // private String thresholdName; + // TODO: change dimension, direction as enum private String dimension; private String direction; - private double stepSizeInPercentage; public SearchBackPressureAction( @@ -135,7 +133,7 @@ public static final class Builder { private final AppContext appContext; private final String thresholdName; 
private final String dimension; - private final String direction; + private String direction; private boolean canUpdate; private double stepSizeInPercentage; private long coolOffPeriodInMillis; @@ -281,4 +279,27 @@ public static class Constants { public static final String DECREASE = "decrease"; } } + + // enum to indicate to whether the action is caused by shard/task level searchbackpressure + // cancellation + public enum SearchbpDimension { + SHARD(SearchbpDimension.Constants.SHARD), + TASK(SearchbpDimension.Constants.TASK); + + private final String value; + + SearchbpDimension(String value) { + this.value = value; + } + + @Override + public String toString() { + return value; + } + + public static class Constants { + public static final String SHARD = "shard"; + public static final String TASK = "task"; + } + } } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 0659dc721..19341e857 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -264,7 +264,7 @@ private void checkShardAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SHARD_DIMENSION_STR, + SearchBackPressureAction.SearchbpDimension.SHARD.toString(), SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); @@ -276,7 +276,7 @@ private void checkShardAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SHARD_DIMENSION_STR, + SearchBackPressureAction.SearchbpDimension.SHARD.toString(), SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE 
.toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); @@ -292,7 +292,7 @@ private void checkTaskAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - TASK_DIMENSION_STR, + SearchBackPressureAction.SearchbpDimension.TASK.toString(), SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); @@ -304,7 +304,7 @@ private void checkTaskAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - TASK_DIMENSION_STR, + SearchBackPressureAction.SearchbpDimension.TASK.toString(), SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE .toString(), policyConfig.getSearchbpHeapStepsizeInPercentage())); From db55b0353f0815b9c97feed6bbe51b7a6bdba12d Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 21 Jul 2023 11:36:55 -0700 Subject: [PATCH 60/73] Use enum for Dimension/Direction for Searchbp Action (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 24 +++++++++---------- .../SearchBackPressurePolicy.java | 20 +++++++--------- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 9f7ccd2f7..880da14bd 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -41,8 +41,8 @@ public class SearchBackPressureAction extends SuppressibleAction { private String thresholdName; // TODO: change dimension, direction as enum - private String dimension; - private String direction; + private SearchbpDimension dimension; + private SearchbpThresholdActionDirection direction; private 
double stepSizeInPercentage; public SearchBackPressureAction( @@ -50,8 +50,8 @@ public SearchBackPressureAction( final boolean canUpdate, final long coolOffPeriodInMillis, final String thresholdName, - final String dimension, - final String direction, + final SearchbpDimension dimension, + final SearchbpThresholdActionDirection direction, final double stepSizeInPercentage) { super(appContext); this.canUpdate = canUpdate; @@ -103,11 +103,11 @@ public String getThresholdName() { } public String getDimension() { - return dimension; + return dimension.toString(); } public String getDirection() { - return direction; + return direction.toString(); } public double getStepSizeInPercentage() { @@ -119,8 +119,8 @@ public String summary() { Summary summary = new Summary( thresholdName, - dimension, - direction, + dimension.toString(), + direction.toString(), stepSizeInPercentage, DEFAULT_COOL_OFF_PERIOD_IN_MILLIS, canUpdate); @@ -132,8 +132,8 @@ public static final class Builder { private final AppContext appContext; private final String thresholdName; - private final String dimension; - private String direction; + private final SearchbpDimension dimension; + private final SearchbpThresholdActionDirection direction; private boolean canUpdate; private double stepSizeInPercentage; private long coolOffPeriodInMillis; @@ -141,8 +141,8 @@ public static final class Builder { private Builder( final AppContext appContext, final String thresholdName, - final String dimension, - final String direction, + final SearchbpDimension dimension, + final SearchbpThresholdActionDirection direction, final long coolOffPeriodInMillis) { this.appContext = appContext; this.thresholdName = thresholdName; diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 19341e857..3f793b900 100644 --- 
a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -264,9 +264,8 @@ private void checkShardAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SearchBackPressureAction.SearchbpDimension.SHARD.toString(), - SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE - .toString(), + SearchBackPressureAction.SearchbpDimension.SHARD, + SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (isShardHeapThresholdTooLarge()) { LOG.info("isShardHeapThresholdTooLarge action Added!"); @@ -276,9 +275,8 @@ private void checkShardAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SearchBackPressureAction.SearchbpDimension.SHARD.toString(), - SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE - .toString(), + SearchBackPressureAction.SearchbpDimension.SHARD, + SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } } @@ -292,9 +290,8 @@ private void checkTaskAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SearchBackPressureAction.SearchbpDimension.TASK.toString(), - SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE - .toString(), + SearchBackPressureAction.SearchbpDimension.TASK, + SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (isTaskHeapThresholdTooLarge()) { LOG.info("isTaskHeapThresholdTooLarge action Added!"); @@ -304,9 +301,8 @@ private void checkTaskAlarms(List actions) { true, DEAFULT_COOLOFF_PERIOD_IN_MILLIS, HEAP_THRESHOLD_STR, - SearchBackPressureAction.SearchbpDimension.TASK.toString(), - 
SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE - .toString(), + SearchBackPressureAction.SearchbpDimension.TASK, + SearchBackPressureAction.SearchbpThresholdActionDirection.DECREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } } From f99424ff3c45847afec1240940c0e47cbe3fd58f Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 21 Jul 2023 13:54:24 -0700 Subject: [PATCH 61/73] Add lambda function for next() in OldGenRCA (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../rca/store/rca/OldGenRca.java | 33 +++++-------------- .../SearchBackPressureRCA.java | 30 +++++++++++++++-- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index e6a9c5aa2..08d161be3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -6,8 +6,10 @@ package org.opensearch.performanceanalyzer.rca.store.rca; +import java.util.Deque; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; @@ -259,41 +261,24 @@ public double readMin() { * be implemented as minSlidingWindow or maxSlidingWindow depending on the need. 
*/ public static class MinMaxSlidingWindow extends SlidingWindow { - boolean isMinSlidingWindow; - // change the boolean to the Biconsumer + BiConsumer, SlidingWindowData> nextElementFunc; public MinMaxSlidingWindow( int SLIDING_WINDOW_SIZE_IN_TIMESTAMP, TimeUnit timeUnit, - boolean isMinSlidingWindow) { + BiConsumer, SlidingWindowData> nextElementFunc) { super(SLIDING_WINDOW_SIZE_IN_TIMESTAMP, timeUnit); - this.isMinSlidingWindow = isMinSlidingWindow; - // get the Biconsumer from the client + // get the Biconsumer lambda function passed in + this.nextElementFunc = nextElementFunc; } @Override public void next(SlidingWindowData e) { - // TODO: refactored into a lambda function and have that lambda as a member variable? - // () => {} + // use the passed in lambda function to accept next element + nextElementFunc.accept(windowDeque, e); - // Biconsumer to add based on conditions - - if (isMinSlidingWindow) { - // monotonically decreasing sliding window - while (!windowDeque.isEmpty() - && windowDeque.peekFirst().getValue() >= e.getValue()) { - windowDeque.pollFirst(); - } - } else { - // monotonically increasing sliding window - while (!windowDeque.isEmpty() - && windowDeque.peekFirst().getValue() < e.getValue()) { - windowDeque.pollFirst(); - } - } - - windowDeque.addFirst(e); + // evict elements in sliding window outside the sliding window size while (!windowDeque.isEmpty() && TimeUnit.MILLISECONDS.toSeconds( e.getTimeStamp() - windowDeque.peekLast().getTimeStamp()) diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java index e8a50d064..a0299b36a 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/searchbackpressure/SearchBackPressureRCA.java @@ -11,8 
+11,10 @@ import java.time.Clock; import java.util.ArrayList; +import java.util.Deque; import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.function.BiConsumer; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.Field; @@ -98,6 +100,28 @@ public class SearchBackPressureRCA extends Rca> // Current time protected Clock clock; + // lambda function to add nextElement to monotonically decreasing sliding window + BiConsumer, SlidingWindowData> minSlidingWindowNextElement = + (windowDeque, nextElement) -> { + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() >= nextElement.getValue()) { + windowDeque.pollFirst(); + } + + windowDeque.addFirst(nextElement); + }; + + // lambda function to add nextElement to monotonically increasing sliding window + BiConsumer, SlidingWindowData> maxSlidingWindowNextElement = + (windowDeque, nextElement) -> { + while (!windowDeque.isEmpty() + && windowDeque.peekFirst().getValue() < nextElement.getValue()) { + windowDeque.pollFirst(); + } + + windowDeque.addFirst(nextElement); + }; + public SearchBackPressureRCA( final int rcaPeriod, final M heapMax, final M heapUsed, M searchbp_Stats) { super(EVAL_INTERVAL_IN_S); @@ -129,9 +153,11 @@ public SearchBackPressureRCA( // sliding window for heap usage this.minHeapUsageSlidingWindow = - new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, true); + new MinMaxSlidingWindow( + SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, minSlidingWindowNextElement); this.maxHeapUsageSlidingWindow = - new MinMaxSlidingWindow(SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, false); + new MinMaxSlidingWindow( + SLIDING_WINDOW_SIZE_IN_MINS, TimeUnit.MINUTES, maxSlidingWindowNextElement); // sliding window for JVM this.shardJVMCancellationSlidingWindow = From e34759bd3d8f74a1e595d8f58de4f0538f726fb1 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 21 Jul 2023 14:27:54 -0700 Subject: [PATCH 62/73] Merged 
main with build.gradle (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- build.gradle | 1 - 1 file changed, 1 deletion(-) diff --git a/build.gradle b/build.gradle index 0d4fbd466..25b70ea22 100644 --- a/build.gradle +++ b/build.gradle @@ -373,7 +373,6 @@ dependencies { strictly "2.23.0" } } - testImplementation group: 'org.powermock', name: 'powermock-core', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-api-support', version: '2.0.0' testImplementation group: 'org.powermock', name: 'powermock-module-junit4-common', version: '2.0.0' From a00dff4f666e601002340b84407fe490d4bc5581 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 10:30:42 -0700 Subject: [PATCH 63/73] resolve nit#1 (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../actions/SearchBackPressureAction.java | 1 - .../SearchBackPressurePolicyConfig.java | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 880da14bd..5f50c4e01 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -40,7 +40,6 @@ public class SearchBackPressureAction extends SuppressibleAction { private long coolOffPeriodInMillis; private String thresholdName; - // TODO: change dimension, direction as enum private SearchbpDimension dimension; private SearchbpThresholdActionDirection direction; private double stepSizeInPercentage; diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java 
b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index cceb94c0b..295ba9276 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -37,11 +37,11 @@ public class SearchBackPressurePolicyConfig { // Default values public static final boolean DEFAULT_ENABLED = true; - // TO DO Decide the Defauilt Hour breach threshold + + // TO DO: Decide the Default Hour breach threshold public static final int DEFAULT_HOUR_BREACH_THRESHOLD = 2; - public static final int DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES = - (int) TimeUnit.HOURS.toMinutes(1); - public static final int DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES = 1; + public static final int HOUR_MONITOR_WINDOW_SIZE_MINUTES = (int) TimeUnit.HOURS.toMinutes(1); + public static final int HOUR_MONITOR_BUCKET_SIZE_MINUTES = 1; public static final double DEFAULT_SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE = 5; private Config hourBreachThreshold; @@ -85,11 +85,11 @@ public int getHourBreachThreshold() { } public int getHourMonitorWindowSizeMinutes() { - return DEFAULT_HOUR_MONITOR_WINDOW_SIZE_MINUTES; + return HOUR_MONITOR_WINDOW_SIZE_MINUTES; } public int getHourMonitorBucketSizeMinutes() { - return DEFAULT_HOUR_MONITOR_BUCKET_SIZE_MINUTES; + return HOUR_MONITOR_BUCKET_SIZE_MINUTES; } public double getSearchbpHeapStepsizeInPercentage() { From 7befe7b19896b76a38264646618e44a877778245 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 10:55:05 -0700 Subject: [PATCH 64/73] resolve nit#2 (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBackPressureDecider.java | 7 ++++--- .../performanceanalyzer/rca/store/rca/OldGenRca.java | 2 +- 2 files changed, 5 
insertions(+), 4 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index ebe7b6986..283143f73 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -22,8 +22,7 @@ public class SearchBackPressureDecider extends Decider { private static final Logger LOG = LogManager.getLogger(SearchBackPressureDecider.class); public static final String NAME = "SearchBackPressureDecider"; - /* TO ADD: SearchBackPressureDecider should have SeachBackPressurePolicy able to evaluate the search back pressure actions */ - SearchBackPressurePolicy searchBackPressurePolicy; + private final SearchBackPressurePolicy searchBackPressurePolicy; private int currentIteration = 0; private SearchBackPressureClusterRCA searchBackPressureClusterRCA; @@ -55,10 +54,12 @@ public Decision operate() { return decision; } + // reset the currentIteration for next action emitting cycle currentIteration = 0; - // SearchBackPressure Policy is always accepted + // SearchBackPressure Policy is always accepted since Searchbp Decider only use the actions suggested by Searchbp Policy List searchBackPressureActions = searchBackPressurePolicy.evaluate(); + // loop through the actions and print the action threshold name, dimension, // increase/decrease for (int i = 0; i < searchBackPressureActions.size(); i++) { diff --git a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java index fefd40ba3..a7b2c25ac 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java 
+++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/rca/OldGenRca.java @@ -254,7 +254,7 @@ public double readMin() { } /** - * Sliding window to check the max/min olg gen usage within a given time frame + * Sliding window to check the max/min old gen usage within a given time frame * * @param isMinSlidingWindow true if the sliding window is for min usage, false for max usage * Provides a more general framework than MinOldGenSlidingWindow as this sliding window can From de4d2894d221cf31b7a0f5dc0baced5c1bf3804d Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 11:27:43 -0700 Subject: [PATCH 65/73] refactor OpenSearchAnalysisGraph (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureDecider.java | 3 +- .../rca/store/OpenSearchAnalysisGraph.java | 64 +++++++++++-------- 2 files changed, 38 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 283143f73..fc8ef2730 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -57,7 +57,8 @@ public Decision operate() { // reset the currentIteration for next action emitting cycle currentIteration = 0; - // SearchBackPressure Policy is always accepted since Searchbp Decider only use the actions suggested by Searchbp Policy + // SearchBackPressure Policy is always accepted since Searchbp Decider only use the actions + // suggested by Searchbp Policy List searchBackPressureActions = searchBackPressurePolicy.evaluate(); // loop through the actions and print the action threshold name, dimension, diff --git 
a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java index caa7f9690..a9423ccd0 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java +++ b/src/main/java/org/opensearch/performanceanalyzer/rca/store/OpenSearchAnalysisGraph.java @@ -444,35 +444,9 @@ public void construct() { shardRequestCacheClusterRca, highHeapUsageClusterRca)); - // Search Back Pressure Service RCA enabled - SearchBackPressureRCA searchBackPressureRCA = - new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, searchbp_Stats); - searchBackPressureRCA.addTag( - RcaConsts.RcaTagConstants.TAG_LOCUS, - RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); - searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, searchbp_Stats)); - - // Search Back Pressure Service Cluster RCA enabled - SearchBackPressureClusterRCA searchBackPressureClusterRCA = - new SearchBackPressureClusterRCA(RCA_PERIOD, searchBackPressureRCA); - searchBackPressureClusterRCA.addTag( - RcaConsts.RcaTagConstants.TAG_LOCUS, - RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); - searchBackPressureClusterRCA.addAllUpstreams( - Collections.singletonList(searchBackPressureRCA)); - searchBackPressureClusterRCA.addTag( - RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, - RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); - - // SearchBackPressure Decider + // SearchBackPressure RCA Decider SearchBackPressureDecider searchBackPressureDecider = - new SearchBackPressureDecider( - EVALUATION_INTERVAL_SECONDS, 12, searchBackPressureClusterRCA); - searchBackPressureDecider.addTag( - RcaConsts.RcaTagConstants.TAG_LOCUS, - RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); - searchBackPressureDecider.addAllUpstreams( - Collections.singletonList(searchBackPressureClusterRCA)); + buildSearchBackPressureDecider(heapMax, heapUsed, searchbp_Stats); // 
AdmissionControl RCA Decider AdmissionControlDecider admissionControlDecider = @@ -514,6 +488,40 @@ public void construct() { pluginController.initPlugins(); } + private SearchBackPressureDecider buildSearchBackPressureDecider( + Metric heapMax, Metric heapUsed, Metric searchbp_Stats) { + // Enbale SearchBackPressure node-level RCA + SearchBackPressureRCA searchBackPressureRCA = + new SearchBackPressureRCA(RCA_PERIOD, heapMax, heapUsed, searchbp_Stats); + searchBackPressureRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_DATA_CLUSTER_MANAGER_NODE); + searchBackPressureRCA.addAllUpstreams(Arrays.asList(heapMax, heapUsed, searchbp_Stats)); + + // Enable SearchBackPressure cluster-level RCA + SearchBackPressureClusterRCA searchBackPressureClusterRCA = + new SearchBackPressureClusterRCA(RCA_PERIOD, searchBackPressureRCA); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); + searchBackPressureClusterRCA.addAllUpstreams( + Collections.singletonList(searchBackPressureRCA)); + searchBackPressureClusterRCA.addTag( + RcaConsts.RcaTagConstants.TAG_AGGREGATE_UPSTREAM, + RcaConsts.RcaTagConstants.LOCUS_DATA_NODE); + + // Enabel SearchBackPressureDecider + SearchBackPressureDecider searchBackPressureDecider = + new SearchBackPressureDecider( + EVALUATION_INTERVAL_SECONDS, RCA_PERIOD, searchBackPressureClusterRCA); + searchBackPressureDecider.addTag( + RcaConsts.RcaTagConstants.TAG_LOCUS, + RcaConsts.RcaTagConstants.LOCUS_CLUSTER_MANAGER_NODE); + searchBackPressureDecider.addAllUpstreams( + Collections.singletonList(searchBackPressureClusterRCA)); + return searchBackPressureDecider; + } + private AdmissionControlDecider buildAdmissionControlDecider(Metric heapUsed, Metric heapMax) { AdmissionControlRca admissionControlRca = new AdmissionControlRca(RCA_PERIOD, heapUsed, heapMax); From 4635d6c17702be5ad469863276d3d47cd254853d Mon Sep 17 00:00:00 2001 From: 
CoderJeffrey Date: Thu, 27 Jul 2023 12:23:58 -0700 Subject: [PATCH 66/73] stream() refactor (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressureDecider.java | 20 +++++++++---------- .../SearchBackPressurePolicy.java | 11 ++++------ 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index fc8ef2730..2290df673 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -63,16 +63,16 @@ public Decision operate() { // loop through the actions and print the action threshold name, dimension, // increase/decrease - for (int i = 0; i < searchBackPressureActions.size(); i++) { - LOG.info( - "Action details, threshold name: {}, dimension: {}, increase/decrease: {}, stepsize: {}", - ((SearchBackPressureAction) searchBackPressureActions.get(i)) - .getThresholdName(), - ((SearchBackPressureAction) searchBackPressureActions.get(i)).getDimension(), - ((SearchBackPressureAction) searchBackPressureActions.get(i)).getDirection(), - ((SearchBackPressureAction) searchBackPressureActions.get(i)) - .getStepSizeInPercentage()); - } + searchBackPressureActions.stream() + .forEach( + (action) -> { + LOG.info( + "searchBackPressureActions details, threshold name: {}, dimension: {}, increase/decrease: {}, stepsize: {}", + ((SearchBackPressureAction) action).getThresholdName(), + ((SearchBackPressureAction) action).getDimension(), + ((SearchBackPressureAction) action).getDirection(), + ((SearchBackPressureAction) action).getStepSizeInPercentage()); + }); 
searchBackPressureActions.forEach(decision::addAction); diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 3f793b900..c0ab09433 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -27,7 +27,6 @@ import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; -import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotNodeSummary; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; import org.opensearch.performanceanalyzer.rca.framework.core.RcaConf; import org.opensearch.performanceanalyzer.rca.framework.util.RcaConsts; @@ -152,13 +151,11 @@ private void recordIssues() { if (!flowUnit.hasResourceSummary()) { continue; } - // print out the total number of flow units in length + HotClusterSummary clusterSummary = flowUnit.getSummary(); - for (HotNodeSummary nodeSummary : clusterSummary.getHotNodeSummaryList()) { - for (HotResourceSummary summary : nodeSummary.getHotResourceSummaryList()) { - record(summary); - } - } + clusterSummary.getHotNodeSummaryList().stream() + .flatMap((nodeSummary) -> nodeSummary.getHotResourceSummaryList().stream()) + .forEach((resourceSummary) -> record(resourceSummary)); } } From a2591cbddf9a990b1e2e2e53d1893c1efa7c73cd Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 13:15:30 -0700 Subject: [PATCH 67/73] null check refactor (Signed-off-by: 
Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index c0ab09433..ac737b284 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -201,23 +201,28 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) public void initialize() { LOG.info("Initializing alarms with dummy path"); // initialize shard level alarm for resounce unit that suggests to increase jvm threshold - if (searchBackPressureShardHeapIncreaseAlarm == null) { - searchBackPressureShardHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); - } + searchBackPressureShardHeapIncreaseAlarm = + initializeAlarmMonitor(searchBackPressureShardHeapIncreaseAlarm); // initialize shard level alarm for resounce unit that suggests to decrease jvm threshold - if (searchBackPressureShardHeapDecreaseAlarm == null) { - searchBackPressureShardHeapDecreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); - } + searchBackPressureShardHeapDecreaseAlarm = + initializeAlarmMonitor(searchBackPressureShardHeapDecreaseAlarm); // initialize task level alarm for resounce unit that suggests to increase jvm threshold - if (searchBackPressureTaskHeapIncreaseAlarm == null) { - searchBackPressureTaskHeapIncreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); - } + searchBackPressureTaskHeapIncreaseAlarm = + initializeAlarmMonitor(searchBackPressureTaskHeapIncreaseAlarm); // initialize task level alarm for 
resounce unit that suggests to decrease jvm threhsold - if (searchBackPressureTaskHeapDecreaseAlarm == null) { - searchBackPressureTaskHeapDecreaseAlarm = createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + searchBackPressureTaskHeapDecreaseAlarm = + initializeAlarmMonitor(searchBackPressureTaskHeapDecreaseAlarm); + } + + private SearchBpActionsAlarmMonitor initializeAlarmMonitor( + SearchBpActionsAlarmMonitor alarmMonitor) { + if (alarmMonitor == null) { + return createAlarmMonitor(SEARCHBP_DATA_FILE_PATH); + } else { + return alarmMonitor; } } @@ -239,9 +244,6 @@ public List evaluate() { initialize(); recordIssues(); - // TODO: Search Task and Shard Alarm Should be seperated - // checkShardAlamr() and but first 2 ifs inside? - // checkTaskAlarms() and but last 2 ifs inside? checkShardAlarms(actions); checkTaskAlarms(actions); From 84c6b3b6ff15ea80eb90c08b8b54145a4958dcb4 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 15:08:00 -0700 Subject: [PATCH 68/73] Change LOG to debug level for unnecessary info (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicyConfig.java | 4 +- .../SearchBackPressureDecider.java | 8 ++-- .../SearchBackPressurePolicy.java | 40 +++++++------------ .../SearchBpActionsAlarmMonitor.java | 6 +-- 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java index 295ba9276..11f7593e6 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/configs/searchbackpressure/SearchBackPressurePolicyConfig.java @@ -56,7 +56,7 @@ public 
SearchBackPressurePolicyConfig(NestedConfig config) { config.getValue(), DEFAULT_HOUR_BREACH_THRESHOLD, Integer.class); - LOG.info( + LOG.debug( "SearchBackPressurePolicyConfig hour breach threshold is: {}", hourBreachThreshold.getValue()); @@ -66,7 +66,7 @@ public SearchBackPressurePolicyConfig(NestedConfig config) { config.getValue(), DEFAULT_SEARCHBP_HEAP_STEPSIZE_IN_PERCENTAGE, Double.class); - LOG.info( + LOG.debug( "searchbpHeapStepsizeInPercentage is {}", searchbpHeapStepsizeInPercentage.getValue()); } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java index 2290df673..68fbb3370 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressureDecider.java @@ -34,7 +34,7 @@ public SearchBackPressureDecider( super(evalIntervalSeconds, decisionFrequency); this.searchBackPressureClusterRCA = searchBackPressureClusterRCA; this.searchBackPressurePolicy = new SearchBackPressurePolicy(searchBackPressureClusterRCA); - LOG.info("SearchBackPressureDecider created#2"); + LOG.debug("SearchBackPressureDecider created"); } @Override @@ -44,7 +44,7 @@ public String name() { @Override public Decision operate() { - LOG.info( + LOG.debug( "SearchBackPressureDecider#2 operate() with currentIteration: {}", currentIteration); @@ -66,7 +66,7 @@ public Decision operate() { searchBackPressureActions.stream() .forEach( (action) -> { - LOG.info( + LOG.debug( "searchBackPressureActions details, threshold name: {}, dimension: {}, increase/decrease: {}, stepsize: {}", ((SearchBackPressureAction) action).getThresholdName(), ((SearchBackPressureAction) action).getDimension(), @@ -76,7 +76,7 @@ public 
Decision operate() { searchBackPressureActions.forEach(decision::addAction); - LOG.info("decision action size is {}", decision.getActions().size()); + LOG.debug("decision action size is {}", decision.getActions().size()); return decision; } diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index ac737b284..64bed8717 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -82,7 +82,6 @@ public SearchBackPressurePolicy( this.searchBackPressureShardHeapDecreaseAlarm = searchBackPressureShardHeapDecreaseAlarm; this.searchBackPressureTaskHeapIncreaseAlarm = searchBackPressureTaskHeapIncreaseAlarm; this.searchBackPressureTaskHeapDecreaseAlarm = searchBackPressureTaskHeapDecreaseAlarm; - LOG.info("SearchBackPressurePolicy#SearchBackPressurePolicy() initialized"); } public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureClusterRCA) { @@ -95,15 +94,11 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC * @param issue an issue with the application */ private void record(HotResourceSummary issue) { - // TODO: change nested if into menaing methods like recordSearchBpShardjIssue() - // recordSearchBpTaskIssue() - - LOG.info("SearchBackPressurePolicy#record()"); if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(issue.getResource())) { - LOG.info("Recording shard-level issue"); + LOG.debug("Recording shard-level issue"); recordSearchBackPressureShardIssue(issue); } else if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { - LOG.info("Recording Task-Level issue"); + LOG.debug("Recording Task-Level issue"); 
recordSearchBackPressureTaskIssue(issue); } } @@ -111,13 +106,13 @@ private void record(HotResourceSummary issue) { private void recordSearchBackPressureShardIssue(HotResourceSummary issue) { // increase alarm for heap-related threshold (shard-level) if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording increase-level issue for shard"); + LOG.debug("recording increase-level issue for shard"); searchBackPressureShardHeapIncreaseAlarm.recordIssue(); } // decrease alarm for heap-related threshold (shard-level) if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording decrease-level issue for shard"); + LOG.debug("recording decrease-level issue for shard"); searchBackPressureShardHeapDecreaseAlarm.recordIssue(); } } @@ -125,24 +120,23 @@ private void recordSearchBackPressureShardIssue(HotResourceSummary issue) { private void recordSearchBackPressureTaskIssue(HotResourceSummary issue) { // increase alarm for heap-related threshold (task-level) if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording increase-level issue for task"); + LOG.debug("recording increase-level issue for task"); searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); } // decrease alarm for heap-related threshold (task-level) if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.info("recording decrease-level issue for task"); + LOG.debug("recording decrease-level issue for task"); searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); } } /** gathers and records all issues observed in the application */ private void recordIssues() { - LOG.info("SearchBackPressurePolicy#recordIssues()"); + LOG.debug("SearchBackPressurePolicy#recordIssues()"); if (searchBackPressureClusterRCA.getFlowUnits().isEmpty()) { - LOG.info( - "SearchBackPressurePolicy#recordIssues() No flow units in 
searchBackPressureClusterRCA"); + LOG.debug("No flow units in searchBackPressureClusterRCA"); return; } @@ -177,9 +171,7 @@ public boolean isTaskHeapThresholdTooLarge() { // create alarm monitor from config public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) { - // LOG the policyConfig.getHourMonitorWindowSizeMinutes() BuketSize and dahy breanch - // threhsold - LOG.info( + LOG.debug( "createAlarmMonitor with hour window: {}, bucket size: {}, hour threshold: {}, stepsize: {}", policyConfig.getHourMonitorWindowSizeMinutes(), policyConfig.getHourMonitorBucketSizeMinutes(), @@ -199,7 +191,6 @@ public SearchBpActionsAlarmMonitor createAlarmMonitor(Path persistenceBasePath) // initalize all alarm monitors public void initialize() { - LOG.info("Initializing alarms with dummy path"); // initialize shard level alarm for resounce unit that suggests to increase jvm threshold searchBackPressureShardHeapIncreaseAlarm = initializeAlarmMonitor(searchBackPressureShardHeapIncreaseAlarm); @@ -228,7 +219,6 @@ private SearchBpActionsAlarmMonitor initializeAlarmMonitor( @Override public List evaluate() { - LOG.info("Evaluate() of SearchBackpressurePolicy started"); List actions = new ArrayList<>(); if (rcaConf == null || appContext == null) { LOG.error("rca conf/app context is null, return empty action list"); @@ -237,7 +227,7 @@ public List evaluate() { policyConfig = rcaConf.getDeciderConfig().getSearchBackPressurePolicyConfig(); if (!policyConfig.isEnabled()) { - LOG.info("SearchBackPressurePolicy is disabled"); + LOG.debug("SearchBackPressurePolicy is disabled"); return actions; } @@ -249,14 +239,14 @@ public List evaluate() { checkTaskAlarms(actions); // print current size of the actions - LOG.info("SearchBackPressurePolicy#evaluate() action size: {}", actions.size()); + LOG.debug("SearchBackPressurePolicy#evaluate() action size: {}", actions.size()); return actions; } private void checkShardAlarms(List actions) { if (isShardHeapThresholdTooSmall()) { 
- LOG.info("isShardHeapThresholdTooSmall action Added!"); + LOG.debug("isShardHeapThresholdTooSmall action Added"); actions.add( new SearchBackPressureAction( appContext, @@ -267,7 +257,7 @@ private void checkShardAlarms(List actions) { SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (isShardHeapThresholdTooLarge()) { - LOG.info("isShardHeapThresholdTooLarge action Added!"); + LOG.debug("isShardHeapThresholdTooLarge action Added"); actions.add( new SearchBackPressureAction( appContext, @@ -282,7 +272,7 @@ private void checkShardAlarms(List actions) { private void checkTaskAlarms(List actions) { if (isTaskHeapThresholdTooSmall()) { - LOG.info("isTaskHeapThresholdTooSmall action Added!"); + LOG.debug("isTaskHeapThresholdTooSmall action Added"); actions.add( new SearchBackPressureAction( appContext, @@ -293,7 +283,7 @@ private void checkTaskAlarms(List actions) { SearchBackPressureAction.SearchbpThresholdActionDirection.INCREASE, policyConfig.getSearchbpHeapStepsizeInPercentage())); } else if (isTaskHeapThresholdTooLarge()) { - LOG.info("isTaskHeapThresholdTooLarge action Added!"); + LOG.debug("isTaskHeapThresholdTooLarge action Added"); actions.add( new SearchBackPressureAction( appContext, diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index ce13ccbf0..af9998ee7 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -96,14 +96,14 @@ public SearchBpActionsAlarmMonitor() { @Override public void recordIssue(long timeStamp, double value) { SlidingWindowData 
dataPoint = new SlidingWindowData(timeStamp, value); - LOG.info("Search Backpressure Actions Alarm is recording a new issue at {}", timeStamp); + LOG.debug("Search Backpressure Actions Alarm is recording a new issue at {}", timeStamp); hourMonitor.next(dataPoint); } private void evaluateAlarm() { if (alarmHealthy) { if (hourMonitor.size() >= hourBreachThreshold) { - LOG.info( + LOG.debug( "Search Backpressure Actions Alarm is Unhealthy because hourMonitor.size() is {}, and threshold is {}", hourMonitor.size(), hourBreachThreshold); @@ -111,7 +111,7 @@ private void evaluateAlarm() { } } else { if (hourMonitor.size() == 0) { - LOG.info("SearchBackpressure Hour Monitor is now healthy for zero capacity"); + LOG.debug("SearchBackpressure Hour Monitor is now healthy for zero capacity"); alarmHealthy = true; } } From 630608a70d6683f6dfc4e3ce862891d16b334c43 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 15:33:16 -0700 Subject: [PATCH 69/73] refactor shar/task issue (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 44 +++++++++---------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index 64bed8717..f6b532146 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -95,39 +95,35 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC */ private void record(HotResourceSummary issue) { if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(issue.getResource())) { - LOG.debug("Recording shard-level issue"); - 
recordSearchBackPressureShardIssue(issue); - } else if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { - LOG.debug("Recording Task-Level issue"); - recordSearchBackPressureTaskIssue(issue); + recordSearchBackPressureIssue(issue, true); } - } - private void recordSearchBackPressureShardIssue(HotResourceSummary issue) { - // increase alarm for heap-related threshold (shard-level) - if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.debug("recording increase-level issue for shard"); - searchBackPressureShardHeapIncreaseAlarm.recordIssue(); - } - - // decrease alarm for heap-related threshold (shard-level) - if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.debug("recording decrease-level issue for shard"); - searchBackPressureShardHeapDecreaseAlarm.recordIssue(); + if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { + recordSearchBackPressureIssue(issue, false); } } - private void recordSearchBackPressureTaskIssue(HotResourceSummary issue) { - // increase alarm for heap-related threshold (task-level) + private void recordSearchBackPressureIssue(HotResourceSummary issue, boolean isShard) { + // increase alarm for heap-related threshold if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - LOG.debug("recording increase-level issue for task"); - searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); + if (isShard) { + LOG.debug("recording increase-level issue for shard"); + searchBackPressureShardHeapIncreaseAlarm.recordIssue(); + } else { + LOG.debug("recording increase-level issue for task"); + searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); + } } - // decrease alarm for heap-related threshold (task-level) + // decrease alarm for heap-related threshold if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - LOG.debug("recording decrease-level issue for task"); - 
searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); + if (isShard) { + LOG.debug("recording decrease-level issue for shard"); + searchBackPressureShardHeapDecreaseAlarm.recordIssue(); + } else { + LOG.debug("recording decrease-level issue for task"); + searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); + } } } From 570d48896bdc98d72fcb8143f12f90d0a9bf3981 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Thu, 27 Jul 2023 15:40:37 -0700 Subject: [PATCH 70/73] nit fix (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../searchbackpressure/SearchBpActionsAlarmMonitor.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java index af9998ee7..666bb3a02 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBpActionsAlarmMonitor.java @@ -24,8 +24,7 @@ public class SearchBpActionsAlarmMonitor implements AlarmMonitor { * if there are more than 30 bad resournce units in one hour, then the alarm shows a Unhealthy Signal */ - // TODO: Remove 3 for testing, replace with 30 - private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 3; + private static final int DEFAULT_HOUR_BREACH_THRESHOLD = 30; private static final int DEFAULT_BUCKET_WINDOW_SIZE = 1; private static final String HOUR_PREFIX = "hour-"; From 82d9033908c271aaddb2c526daf043c17b420434 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 28 Jul 2023 11:53:22 -0700 Subject: [PATCH 71/73] Added JavaDoc for Searchbp Action (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- 
.../actions/SearchBackPressureAction.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java index 5f50c4e01..57381f5b8 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/actions/SearchBackPressureAction.java @@ -84,9 +84,13 @@ public List impactedNodes() { .collect(Collectors.toList()); } - /* TO DO: Discuss the impact of SearchBackPressureAction - * since our action only modify the threhsold settings of Search Back Pressure Service instead of actual Resource - * No Impact should be put as the Impact Vector for this action so other actions would not be affected by Searchbp-specific actions + /* Search Back Pressure Decider/Policy only tunes searchbackpressure related thresholds (e.g. search_backpressure.search_task_heap_threshold) + * and it does not correlate directly with any current dimension in the ImpactVector (e.g. CPU/HEAP). + * And the current Searchbp actions only adjust heap related Searchbp Thresholds for now. + * Dimensions in ImpactVector is used by collator to determine which action should be emitted to Publisher, + * eventually which actions should the downstream class execute. So if there are 2 actions emitting in the same time, one increase CPU and one decrease it, the collator cancel out the actions. + * However, since for Searchbp Actions we only tune the searchbp threshold once per time (it's impossible for 2 actions emitting in the same time that increase and decrease searchbackpressure heap usage threshold). + * Therefore, we put no Impact for ImpactVector for Searchbp Actions. 
*/ @Override public Map impact() { From c75103282f51535d1600b1f3809d334b33bbbde5 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Fri, 28 Jul 2023 16:17:30 -0700 Subject: [PATCH 72/73] SearchBackPressureIssue Interface created for refactor (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 96 +++++++++++++------ .../model/SearchBackPressureIssue.java | 30 ++++++ .../SearchBackPressureSearchTaskIssue.java | 68 +++++++++++++ .../model/SearchBackPressureShardIssue.java | 68 +++++++++++++ 4 files changed, 233 insertions(+), 29 deletions(-) create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureIssue.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureSearchTaskIssue.java create mode 100644 src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureShardIssue.java diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index f6b532146..a83ef0466 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -13,6 +13,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.logging.log4j.LogManager; @@ -22,8 +23,12 @@ import org.opensearch.performanceanalyzer.decisionmaker.actions.SearchBackPressureAction; import 
org.opensearch.performanceanalyzer.decisionmaker.deciders.DecisionPolicy; import org.opensearch.performanceanalyzer.decisionmaker.deciders.configs.searchbackpressure.SearchBackPressurePolicyConfig; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model.SearchBackPressureIssue; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model.SearchBackPressureSearchTaskIssue; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model.SearchBackPressureSearchTaskIssue.SearchbpTaskAlarmMonitorMapKeys; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model.SearchBackPressureShardIssue; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model.SearchBackPressureShardIssue.SearchbpShardAlarmMonitorMapKeys; import org.opensearch.performanceanalyzer.grpc.Resource; -import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; import org.opensearch.performanceanalyzer.rca.framework.api.aggregators.BucketizedSlidingWindowConfig; import org.opensearch.performanceanalyzer.rca.framework.api.flow_units.ResourceFlowUnit; import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotClusterSummary; @@ -62,14 +67,18 @@ public class SearchBackPressurePolicy implements DecisionPolicy { static final List HEAP_SEARCHBP_TASK_SIGNALS = Lists.newArrayList(SEARCHBACKPRESSURE_TASK); + SearchBackPressureIssue searchBackPressureIssue; + /* alarm monitors per threshold */ // shard-level alarms @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureShardHeapIncreaseAlarm; @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureShardHeapDecreaseAlarm; + HashMap searchBackPressureShardAlarmMonitorMap; // task-level alarms @VisibleForTesting SearchBpActionsAlarmMonitor searchBackPressureTaskHeapIncreaseAlarm; @VisibleForTesting SearchBpActionsAlarmMonitor 
searchBackPressureTaskHeapDecreaseAlarm; + HashMap searchBackPressureTaskAlarmMonitorMap; public SearchBackPressurePolicy( SearchBackPressureClusterRCA searchBackPressureClusterRCA, @@ -93,39 +102,46 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC * * @param issue an issue with the application */ - private void record(HotResourceSummary issue) { - if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(issue.getResource())) { - recordSearchBackPressureIssue(issue, true); + private void record(HotResourceSummary summary) { + if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(summary.getResource())) { + // recordSearchBackPressureIssue(issue, true); + searchBackPressureIssue = + new SearchBackPressureShardIssue( + summary, searchBackPressureShardAlarmMonitorMap); + searchBackPressureIssue.recordIssueBySummaryType(summary); } - if (HEAP_SEARCHBP_TASK_SIGNALS.contains(issue.getResource())) { - recordSearchBackPressureIssue(issue, false); + if (HEAP_SEARCHBP_TASK_SIGNALS.contains(summary.getResource())) { + searchBackPressureIssue = + new SearchBackPressureSearchTaskIssue( + summary, searchBackPressureTaskAlarmMonitorMap); + searchBackPressureIssue.recordIssueBySummaryType(summary); } } - private void recordSearchBackPressureIssue(HotResourceSummary issue, boolean isShard) { - // increase alarm for heap-related threshold - if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - if (isShard) { - LOG.debug("recording increase-level issue for shard"); - searchBackPressureShardHeapIncreaseAlarm.recordIssue(); - } else { - LOG.debug("recording increase-level issue for task"); - searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); - } - } - - // decrease alarm for heap-related threshold - if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - if (isShard) { - LOG.debug("recording decrease-level issue for shard"); - searchBackPressureShardHeapDecreaseAlarm.recordIssue(); - } else { - 
LOG.debug("recording decrease-level issue for task"); - searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); - } - } - } + // private void recordSearchBackPressureIssue(HotResourceSummary issue, boolean isShard) { + // // increase alarm for heap-related threshold + // if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { + // if (isShard) { + // LOG.debug("recording increase-level issue for shard"); + // searchBackPressureShardHeapIncreaseAlarm.recordIssue(); + // } else { + // LOG.debug("recording increase-level issue for task"); + // searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); + // } + // } + + // // decrease alarm for heap-related threshold + // if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { + // if (isShard) { + // LOG.debug("recording decrease-level issue for shard"); + // searchBackPressureShardHeapDecreaseAlarm.recordIssue(); + // } else { + // LOG.debug("recording decrease-level issue for task"); + // searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); + // } + // } + // } /** gathers and records all issues observed in the application */ private void recordIssues() { @@ -202,6 +218,8 @@ public void initialize() { // initialize task level alarm for resounce unit that suggests to decrease jvm threhsold searchBackPressureTaskHeapDecreaseAlarm = initializeAlarmMonitor(searchBackPressureTaskHeapDecreaseAlarm); + + initializeAlarmMonitorMap(); } private SearchBpActionsAlarmMonitor initializeAlarmMonitor( @@ -213,6 +231,26 @@ private SearchBpActionsAlarmMonitor initializeAlarmMonitor( } } + private void initializeAlarmMonitorMap() { + // add shard level monitors to shardAlarmMonitorMap + searchBackPressureShardAlarmMonitorMap = new HashMap(); + searchBackPressureShardAlarmMonitorMap.put( + SearchbpShardAlarmMonitorMapKeys.SHARD_HEAP_INCREASE_ALARM.toString(), + searchBackPressureShardHeapIncreaseAlarm); + searchBackPressureShardAlarmMonitorMap.put( + 
SearchbpShardAlarmMonitorMapKeys.SHARD_HEAP_DECREASE_ALARM.toString(), + searchBackPressureShardHeapDecreaseAlarm); + + // add task level monitors to taskAlarmMonitorMap + searchBackPressureTaskAlarmMonitorMap = new HashMap(); + searchBackPressureTaskAlarmMonitorMap.put( + SearchbpTaskAlarmMonitorMapKeys.TASK_HEAP_INCREASE_ALARM.toString(), + searchBackPressureTaskHeapIncreaseAlarm); + searchBackPressureTaskAlarmMonitorMap.put( + SearchbpTaskAlarmMonitorMapKeys.TASK_HEAP_DECREASE_ALARM.toString(), + searchBackPressureTaskHeapDecreaseAlarm); + } + @Override public List evaluate() { List actions = new ArrayList<>(); diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureIssue.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureIssue.java new file mode 100644 index 000000000..423ad429f --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureIssue.java @@ -0,0 +1,30 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model; + + +import java.util.HashMap; +import java.util.Map; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBpActionsAlarmMonitor; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; + +/* + * SearchBackPressureIssue is the interface for all types of SearchBackPressure Issue (e.g. 
issue caused by overflow of shard-level heap usage) + */ +public abstract class SearchBackPressureIssue { + public HotResourceSummary hotResourceSummary; + public Map actionsAlarmMonitorMap; + + // constructor: keeps the summary and the alarm-monitor map as fields + SearchBackPressureIssue( + HotResourceSummary hotResourceSummary, + HashMap actionsAlarmMonitorMap) { + this.hotResourceSummary = hotResourceSummary; + this.actionsAlarmMonitorMap = actionsAlarmMonitorMap; + } + /** Records the given summary; each subclass picks the alarm monitor to record on. */ + public abstract void recordIssueBySummaryType(HotResourceSummary hotResourceSummary); +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureSearchTaskIssue.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureSearchTaskIssue.java new file mode 100644 index 000000000..8169cc024 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureSearchTaskIssue.java @@ -0,0 +1,68 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model; + + +import java.util.HashMap; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBpActionsAlarmMonitor; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +/** Task-level issue: records on the task heap increase or decrease alarm monitor. */ +public class SearchBackPressureSearchTaskIssue extends SearchBackPressureIssue { + private static final Logger LOG = LogManager.getLogger(SearchBackPressureSearchTaskIssue.class); + + public SearchBackPressureSearchTaskIssue( + HotResourceSummary hotResourceSummary, + HashMap actionsAlarmMonitorMap) { + super(hotResourceSummary, actionsAlarmMonitorMap); + } + + @Override + public
void recordIssueBySummaryType(HotResourceSummary summary) { + // increase alarm for heap-related threshold; metadata is compared by value, not by reference + if (SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR.equals(summary.getMetaData())) { + LOG.debug("recording increase-level issue for task"); + actionsAlarmMonitorMap + .get(SearchbpTaskAlarmMonitorMapKeys.TASK_HEAP_INCREASE_ALARM.toString()) + .recordIssue(); + } + + // decrease alarm for heap-related threshold (constant-first equals is null-safe) + if (SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR.equals(summary.getMetaData())) { + LOG.debug("recording decrease-level issue for task"); + actionsAlarmMonitorMap + .get(SearchbpTaskAlarmMonitorMapKeys.TASK_HEAP_DECREASE_ALARM.toString()) + .recordIssue(); + } + } + + public enum SearchbpTaskAlarmMonitorMapKeys { + TASK_HEAP_INCREASE_ALARM( + SearchbpTaskAlarmMonitorMapKeys.Constants.TASK_HEAP_INCREASE_ALARM), + TASK_HEAP_DECREASE_ALARM( + SearchbpTaskAlarmMonitorMapKeys.Constants.TASK_HEAP_DECREASE_ALARM); + + private final String value; + + SearchbpTaskAlarmMonitorMapKeys(String value) { + this.value = value; + } + + @Override + public String toString() { + return value; + } + + public static class Constants { + public static final String TASK_HEAP_INCREASE_ALARM = + "searchBackPressureTaskHeapIncreaseAlarm"; + public static final String TASK_HEAP_DECREASE_ALARM = + "searchBackPressureTaskHeapDecreaseAlarm"; + } + } +} diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureShardIssue.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureShardIssue.java new file mode 100644 index 000000000..bb13e6b31 --- /dev/null +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/model/SearchBackPressureShardIssue.java @@ -0,0 +1,68 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package
org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.model; + + +import java.util.HashMap; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.opensearch.performanceanalyzer.decisionmaker.deciders.searchbackpressure.SearchBpActionsAlarmMonitor; +import org.opensearch.performanceanalyzer.rca.configs.SearchBackPressureRcaConfig; +import org.opensearch.performanceanalyzer.rca.framework.api.summaries.HotResourceSummary; +/** Shard-level issue: records on the shard heap increase or decrease alarm monitor. */ +public class SearchBackPressureShardIssue extends SearchBackPressureIssue { + private static final Logger LOG = LogManager.getLogger(SearchBackPressureShardIssue.class); + + public SearchBackPressureShardIssue( + HotResourceSummary hotResourceSummary, + HashMap actionsAlarmMonitorMap) { + super(hotResourceSummary, actionsAlarmMonitorMap); + } + + @Override + public void recordIssueBySummaryType(HotResourceSummary summary) { + if (SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR.equals(summary.getMetaData())) { + LOG.debug("recording increase-level issue for shard"); + LOG.debug("size of the HashMap: {}", actionsAlarmMonitorMap.size()); + actionsAlarmMonitorMap + .get(SearchbpShardAlarmMonitorMapKeys.SHARD_HEAP_INCREASE_ALARM.toString()) + .recordIssue(); + } + + // decrease alarm for heap-related threshold (metadata compared by value; constant-first equals is null-safe) + if (SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR.equals(summary.getMetaData())) { + LOG.debug("recording decrease-level issue for shard"); + actionsAlarmMonitorMap + .get(SearchbpShardAlarmMonitorMapKeys.SHARD_HEAP_DECREASE_ALARM.toString()) + .recordIssue(); + } + } + + public enum SearchbpShardAlarmMonitorMapKeys { + SHARD_HEAP_INCREASE_ALARM( + SearchbpShardAlarmMonitorMapKeys.Constants.SHARD_HEAP_INCREASE_ALARM), + SHARD_HEAP_DECREASE_ALARM( + SearchbpShardAlarmMonitorMapKeys.Constants.SHARD_HEAP_DECREASE_ALARM); + + private final String value; + + SearchbpShardAlarmMonitorMapKeys(String value) { + this.value = value; + } + + @Override + public String toString()
{ + return value; + } + + public static class Constants { + public static final String SHARD_HEAP_INCREASE_ALARM = + "searchBackPressureShardHeapIncreaseAlarm"; + public static final String SHARD_HEAP_DECREASE_ALARM = + "searchBackPressureShardHeapDecreaseAlarm"; + } + } +} From 1cf5fc66b937c470589038d3cc928ac0f2d51b30 Mon Sep 17 00:00:00 2001 From: CoderJeffrey Date: Wed, 2 Aug 2023 10:02:30 -0700 Subject: [PATCH 73/73] Remove dead comment (Signed-off-by: Jeffrey Liu ujeffliu@amazon.com) Signed-off-by: CoderJeffrey --- .../SearchBackPressurePolicy.java | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java index a83ef0466..67bd14db3 100644 --- a/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java +++ b/src/main/java/org/opensearch/performanceanalyzer/decisionmaker/deciders/searchbackpressure/SearchBackPressurePolicy.java @@ -104,7 +104,6 @@ public SearchBackPressurePolicy(SearchBackPressureClusterRCA searchBackPressureC */ private void record(HotResourceSummary summary) { if (HEAP_SEARCHBP_SHARD_SIGNALS.contains(summary.getResource())) { - // recordSearchBackPressureIssue(issue, true); searchBackPressureIssue = new SearchBackPressureShardIssue( summary, searchBackPressureShardAlarmMonitorMap); @@ -119,30 +118,6 @@ private void record(HotResourceSummary summary) { } } - // private void recordSearchBackPressureIssue(HotResourceSummary issue, boolean isShard) { - // // increase alarm for heap-related threshold - // if (issue.getMetaData() == SearchBackPressureRcaConfig.INCREASE_THRESHOLD_BY_JVM_STR) { - // if (isShard) { - // LOG.debug("recording increase-level issue for shard"); - // searchBackPressureShardHeapIncreaseAlarm.recordIssue(); - // 
} else { - // LOG.debug("recording increase-level issue for task"); - // searchBackPressureTaskHeapIncreaseAlarm.recordIssue(); - // } - // } - - // // decrease alarm for heap-related threshold - // if (issue.getMetaData() == SearchBackPressureRcaConfig.DECREASE_THRESHOLD_BY_JVM_STR) { - // if (isShard) { - // LOG.debug("recording decrease-level issue for shard"); - // searchBackPressureShardHeapDecreaseAlarm.recordIssue(); - // } else { - // LOG.debug("recording decrease-level issue for task"); - // searchBackPressureTaskHeapDecreaseAlarm.recordIssue(); - // } - // } - // } - /** gathers and records all issues observed in the application */ private void recordIssues() { LOG.debug("SearchBackPressurePolicy#recordIssues()");