Skip to content

Commit

Permalink
Merge pull request #66 from paigerube14/cmr_2
Browse files Browse the repository at this point in the history
CMR
  • Loading branch information
paigerube14 authored Sep 20, 2024
2 parents 248893a + 9476a1f commit 2c9f287
Show file tree
Hide file tree
Showing 10 changed files with 171 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/pylint.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ jobs:
- name: Analysing the code with pylint
run: |
pylint -d C0103 -d R0912 $(git ls-files '*/*.py' '*.py')
pylint -d C0103 -d R0912 -d R0917 $(git ls-files '*/*.py' '*.py')
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ Additionally, users can specify a custom path for the output CSV file using the

Orion now supports anomaly detection for your data. Use the ```--anomaly-detection``` command to start the anomaly detection process.


To find significant percent differences between workload runs, use the ```--cmr``` flag. This compares the most recent run with any previous matching runs or baseline UUIDs. If more than one previous run is found, their values are averaged together and then compared with the most recent run. Use it with *direction: 0* (set in the config) and the ```-o json``` output format to see the percent differences.

![cmr percent difference](percentdiff.jpg)

You can now constrain your look-back period using the ```--lookback``` option. The format for look-back is ```XdYh```, where X represents the number of days and Y represents the number of hours.

To specify how many runs to look back, you can use the ```--lookback-size``` option. By default, this option is set to 10000.
Expand Down Expand Up @@ -156,7 +161,7 @@ This is similar to how car manufacturers warranty plays out such as 5years or 60

You can open the match requirement by using the ```--node-count``` option to find any matching uuid based on the metadata and not have to have the same jobConfig.jobIterations. This variable is a ```True``` or ```False```, defaulted to False.

**_NOTE:_** The ```--hunter-analyze``` and ```--anomaly-detection``` flags are mutually exclusive. They cannot be used together because they represent different algorithms designed for distinct use cases.
**_NOTE:_** The ```--cmr```, ```--hunter-analyze```, and ```--anomaly-detection``` flags are mutually exclusive. They cannot be used together because they represent different algorithms designed for distinct use cases.

### Daemon mode
The core purpose of Daemon mode is to operate Orion as a self-contained server, dedicated to handling incoming requests. By sending a POST request accompanied by a test name of predefined tests, users can trigger change point detection on the provided metadata and metrics. Following the processing, the response is formatted in JSON, providing a structured output for seamless integration and analysis. To trigger daemon mode just use the following commands
Expand Down
12 changes: 10 additions & 2 deletions orion.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ def cli(max_content_width=120): # pylint: disable=unused-argument

# pylint: disable=too-many-locals
@cli.command(name="cmd")
@click.option(
"--cmr",
is_flag=True,
help="Generate percent difference in comparison",
cls=MutuallyExclusiveOption,
mutually_exclusive=["anomaly_detection","hunter_analyze"],
)
@click.option("--filter", is_flag=True, help="Generate percent difference in comparison")
@click.option("--config", default="config.yaml", help="Path to the configuration file")
@click.option(
"--save-data-path", default="data.csv", help="Path to save the output file"
Expand All @@ -79,7 +87,7 @@ def cli(max_content_width=120): # pylint: disable=unused-argument
is_flag=True,
help="run hunter analyze",
cls=MutuallyExclusiveOption,
mutually_exclusive=["anomaly_detection"],
mutually_exclusive=["anomaly_detection","cmr"],
)
@click.option("--anomaly-window", type=int, callback=validate_anomaly_options, help="set window size for moving average for anomaly-detection")
@click.option("--min-anomaly-percent", type=int, callback=validate_anomaly_options, help="set minimum percentage difference from moving average for data point to be detected as anomaly")
Expand All @@ -88,7 +96,7 @@ def cli(max_content_width=120): # pylint: disable=unused-argument
is_flag=True,
help="run anomaly detection algorithm powered by isolation forest",
cls=MutuallyExclusiveOption,
mutually_exclusive=["hunter_analyze"],
mutually_exclusive=["hunter_analyze","cmr"],
)
@click.option(
"-o",
Expand Down
Binary file added percentdiff.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
3 changes: 3 additions & 0 deletions pkg/algorithms/algorithmFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pkg.constants as cnsts
from .edivisive import EDivisive
from .isolationforest import IsolationForestWeightedMean
from .cmr import CMR


class AlgorithmFactory: # pylint: disable= too-few-public-methods, too-many-arguments, line-too-long
Expand All @@ -30,4 +31,6 @@ def instantiate_algorithm(self, algorithm: str, matcher: Matcher, dataframe:pd.D
return EDivisive(matcher, dataframe, test, options, metrics_config)
if algorithm == cnsts.ISOLATION_FOREST:
return IsolationForestWeightedMean(matcher, dataframe, test, options, metrics_config)
if algorithm == cnsts.CMR:
return CMR(matcher, dataframe, test, options, metrics_config)
raise ValueError("Invalid algorithm called")
4 changes: 4 additions & 0 deletions pkg/algorithms/cmr/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"""
Init for CMR Algorithm
"""
from .cmr import CMR
108 changes: 108 additions & 0 deletions pkg/algorithms/cmr/cmr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""CMR - Comparing Mean Responses Algorithm"""

# pylint: disable = line-too-long
import pandas as pd
import numpy

from fmatch.logrus import SingletonLogger
from hunter.series import ChangePoint, ComparativeStats
from pkg.algorithms.algorithm import Algorithm


class CMR(Algorithm):
    """Implementation of the CMR (Comparing Mean Responses) algorithm

    Collapses every run except the most recent one into a single averaged row,
    pairs that row with the most recent run, and emits one ChangePoint per
    configured metric holding the two values to compare.

    Args:
        Algorithm (Algorithm): Inherits
    """

    def _analyze(self):
        """Average all previous runs and compare them with the most recent run

        Returns:
            series: data series that contains attributes and full dataframe
            change_points_by_metric: dict of metric name to list of ChangePoints
                (empty when there are fewer than two runs to compare)
        """
        logger_instance = SingletonLogger.getLogger("Orion")
        logger_instance.info("Starting analysis using CMR")
        # Convert timestamps to integer epoch seconds
        self.dataframe["timestamp"] = pd.to_datetime(self.dataframe["timestamp"])
        self.dataframe["timestamp"] = self.dataframe["timestamp"].astype(int) // 10**9

        # A comparison needs at least two rows; with fewer, return the data untouched
        if len(self.dataframe.index) < 2:
            series = self.setup_series()
            series.data = self.dataframe
            return series, {}

        # Reduce to exactly two rows: averaged previous runs, then the current run
        self.dataframe = self.combine_and_average_runs(self.dataframe)

        series = self.setup_series()

        df, change_points_by_metric = self.run_cmr(self.dataframe)
        series.data = df
        return series, change_points_by_metric

    def run_cmr(self, dataframe_list: pd.DataFrame):
        """
        Build one ChangePoint per metric from a 2 row dataframe

        Args:
            dataframe_list (pd.DataFrame): two-row data frame; row 0 holds the
                averaged previous runs and row 1 the most recent run
        Returns:
            pd.Dataframe, dict[metric_name, list[ChangePoint]]: the unchanged data
                frame and the change points keyed by metric name
        """
        change_points_by_metric = {}

        for column in self.metrics_config.keys():
            values = dataframe_list[column]
            # Positional access so the result does not depend on index labels.
            # std/pvalue/time are fixed placeholders: only two values are compared.
            stats = ComparativeStats(
                mean_1=values.iloc[0],
                mean_2=values.iloc[1],
                std_1=0,
                std_2=0,
                pvalue=1,
            )
            change_point = ChangePoint(metric=column, index=1, time=0, stats=stats)
            change_points_by_metric[column] = [change_point]

        # NOTE(review): pass/fail is presumably derived from these change points
        # by the caller; nothing in this class decides it.
        return dataframe_list, change_points_by_metric

    @staticmethod
    def _is_numeric_column(values: pd.Series) -> bool:
        """Tell whether a column should be averaged rather than joined as text"""
        if pd.api.types.is_bool_dtype(values):
            return False
        if pd.api.types.is_numeric_dtype(values):
            return True
        # Object columns: keep the earlier rule of inspecting the first value,
        # but positionally and for any numpy integer/float width.
        return len(values) > 0 and isinstance(
            values.iloc[0], (numpy.floating, numpy.integer)
        )

    def combine_and_average_runs(self, dataFrame: pd.DataFrame):
        """
        If more than 1 previous run, mean data together into 1 single row
        Combine with current run into 1 data frame (current run being -1 index)

        Numeric columns are averaged; every other column is flattened into a
        comma-separated string of its values.

        Args:
            dataFrame (pd.DataFrame): data to combine into 2 rows
        Returns:
            pd.Dataframe: data frame of averaged previous runs (row 0) and the
                most recent run (row 1)
        """
        last_row = dataFrame.tail(1)
        previous_runs = dataFrame[:-1]
        combined = {}

        for column in dataFrame.columns:
            values = previous_runs[column]
            if self._is_numeric_column(values):
                combined[column] = [values.mean()]
            else:
                # str() each value so None/non-string entries cannot break the join
                combined[column] = [",".join(str(value) for value in values)]

        averaged_row = pd.DataFrame(combined)
        return pd.concat([averaged_row, last_row], ignore_index=True)
1 change: 1 addition & 0 deletions pkg/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
JSON="json"
TEXT="text"
JUNIT="junit"
CMR="cmr"
24 changes: 19 additions & 5 deletions pkg/runTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,24 @@
import pkg.constants as cnsts
from pkg.utils import get_datasource, process_test, get_subtracted_timestamp

def get_algorithm_type(kwargs):
    """Map the command line flags to an algorithm name

    The flags are checked in order: hunter_analyze, anomaly_detection, cmr.
    The first one that is set decides the result.

    Args:
        kwargs (dict): passed command line arguments
    Returns:
        str: algorithm name, or None when no algorithm flag is set
    """
    if kwargs["hunter_analyze"]:
        return cnsts.EDIVISIVE
    if kwargs["anomaly_detection"]:
        return cnsts.ISOLATION_FOREST
    if kwargs['cmr']:
        return cnsts.CMR
    return None

def run(**kwargs: dict[str, Any]) -> dict[str, Any]: #pylint: disable = R0914
"""run method to start the tests
Expand Down Expand Up @@ -48,11 +65,8 @@ def run(**kwargs: dict[str, Any]) -> dict[str, Any]: #pylint: disable = R0914
if fingerprint_matched_df is None:
sys.exit(3) # No data present

if kwargs["hunter_analyze"]:
algorithm_name = cnsts.EDIVISIVE
elif kwargs["anomaly_detection"]:
algorithm_name = cnsts.ISOLATION_FOREST
else:
algorithm_name = get_algorithm_type(kwargs)
if algorithm_name is None:
return None, None

algorithmFactory = AlgorithmFactory()
Expand Down
21 changes: 19 additions & 2 deletions pkg/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,17 +258,34 @@ def process_test(
shortener = pyshorteners.Shortener(timeout=10)
merged_df["buildUrl"] = merged_df["uuid"].apply(
lambda uuid: (
shortener.tinyurl.short(buildUrls[uuid])
shorten_url(shortener, buildUrls[uuid])
if options["convert_tinyurl"]
else buildUrls[uuid]
) # pylint: disable = cell-var-from-loop
)

# pylint: disable = cell-var-from-loop
)
merged_df=merged_df.reset_index(drop=True)
#save the dataframe
output_file_path = f"{options['save_data_path'].split('.')[0]}-{test['name']}.csv"
match.save_results(merged_df, csv_file_path=output_file_path)
return merged_df, metrics_config

def shorten_url(shortener: Any, uuids: str) -> str:
    """Shorten every build URL in a comma-separated string

    Args:
        shortener (Any): shortener object exposing ``tinyurl.short(url)``
        uuids (str): one or more build URLs joined by "," (despite its name,
            this holds URLs, not uuids; the name is kept for compatibility)
    Returns:
        str: the shortened URLs joined by ",", in the original order
    """
    return ",".join(
        shortener.tinyurl.short(build_url) for build_url in uuids.split(",")
    )

def get_metadata_with_uuid(uuid: str, match: Matcher) -> Dict[Any, Any]:
"""Gets metadata of the run from each test
Expand Down

0 comments on commit 2c9f287

Please sign in to comment.