Merge pull request #66 from paigerube14/cmr_2

CMR
cloud-bulldozer · Sep 20, 2024 · 2c9f287 · 2c9f287
2 parents 248893a + 9476a1f
commit 2c9f287
Show file tree

Hide file tree

Showing 10 changed files with 171 additions and 11 deletions.
diff --git a/.github/workflows/pylint.yaml b/.github/workflows/pylint.yaml
@@ -28,4 +28,4 @@ jobs:
 
     - name: Analysing the code with pylint
       run: |
-        pylint -d C0103 -d R0912 $(git ls-files '*/*.py' '*.py')
+        pylint -d C0103 -d R0912 -d R0917 $(git ls-files '*/*.py' '*.py')
diff --git a/README.md b/README.md
@@ -123,6 +123,11 @@ Additionally, users can specify a custom path for the output CSV file using the
 
 Orion now supports anomaly detection for your data. Use the ```--anomaly-detection``` command to start the anomaly detection process.
 
+
+To find significant percent differences between workload runs, use the ```--cmr``` flag. It compares the most recent run with any previous matching runs or baseline UUIDs. If more than one previous run is found, their values are averaged and that mean is compared with the most recent run. When using the ```-o json``` output format, set *direction: 0* in the config to see the percent differences.
+
+![cmr percent difference](percentdiff.jpg)
+
 You can now constrain your look-back period using the ```--lookback``` option. The format for look-back is ```XdYh```, where X represents the number of days and Y represents the number of hours.
 
 To specify how many runs to look back, you can use the ```--lookback-size``` option. By default, this option is set to 10000.
@@ -156,7 +161,7 @@ This is similar to how car manufacturers warranty plays out such as 5years or 60
 
 You can open the match requirement by using the ```--node-count``` option to find any matching uuid based on the metadata and not have to have the same jobConfig.jobIterations. This variable is a ```True``` or ```False```, defaulted to False. 
 
-**_NOTE:_**  The ```--hunter-analyze``` and ```--anomaly-detection``` flags are mutually exclusive. They cannot be used together because they represent different algorithms designed for distinct use cases.
+**_NOTE:_**  The ```--cmr```, ```--hunter-analyze``` and ```--anomaly-detection``` flags are mutually exclusive. They cannot be used together because they represent different algorithms designed for distinct use cases.
 
 ### Daemon mode
 The core purpose of Daemon mode is to operate Orion as a self-contained server, dedicated to handling incoming requests. By sending a POST request accompanied by a test name of predefined tests, users can trigger change point detection on the provided metadata and metrics. Following the processing, the response is formatted in JSON, providing a structured output for seamless integration and analysis. To trigger daemon mode just use the following commands

diff --git a/orion.py b/orion.py
@@ -69,6 +69,14 @@ def cli(max_content_width=120):  # pylint: disable=unused-argument
 
 # pylint: disable=too-many-locals
 @cli.command(name="cmd")
+@click.option(
+    "--cmr", 
+    is_flag=True,
+    help="Generate percent difference in comparison",
+    cls=MutuallyExclusiveOption,
+    mutually_exclusive=["anomaly_detection","hunter_analyze"],
+)
+@click.option("--filter", is_flag=True, help="Generate percent difference in comparison")
 @click.option("--config", default="config.yaml", help="Path to the configuration file")
 @click.option(
     "--save-data-path", default="data.csv", help="Path to save the output file"
@@ -79,7 +87,7 @@ def cli(max_content_width=120):  # pylint: disable=unused-argument
     is_flag=True,
     help="run hunter analyze",
     cls=MutuallyExclusiveOption,
-    mutually_exclusive=["anomaly_detection"],
+    mutually_exclusive=["anomaly_detection","cmr"],
 )
 @click.option("--anomaly-window", type=int, callback=validate_anomaly_options, help="set window size for moving average for anomaly-detection")
 @click.option("--min-anomaly-percent", type=int, callback=validate_anomaly_options, help="set minimum percentage difference from moving average for data point to be detected as anomaly")
@@ -88,7 +96,7 @@ def cli(max_content_width=120):  # pylint: disable=unused-argument
     is_flag=True,
     help="run anomaly detection algorithm powered by isolation forest",
     cls=MutuallyExclusiveOption,
-    mutually_exclusive=["hunter_analyze"],
+    mutually_exclusive=["hunter_analyze","cmr"],
 )
 @click.option(
     "-o",

diff --git a/percentdiff.jpg b/percentdiff.jpg
diff --git a/pkg/algorithms/algorithmFactory.py b/pkg/algorithms/algorithmFactory.py
@@ -6,6 +6,7 @@
 import pkg.constants as cnsts
 from .edivisive import EDivisive
 from .isolationforest import IsolationForestWeightedMean
+from .cmr import CMR
 
 
 class AlgorithmFactory: # pylint: disable= too-few-public-methods, too-many-arguments, line-too-long
@@ -30,4 +31,6 @@ def instantiate_algorithm(self, algorithm: str, matcher: Matcher, dataframe:pd.D
             return EDivisive(matcher, dataframe, test, options, metrics_config)
         if algorithm == cnsts.ISOLATION_FOREST:
             return IsolationForestWeightedMean(matcher, dataframe, test, options, metrics_config)
+        if algorithm == cnsts.CMR:
+            return CMR(matcher, dataframe, test, options, metrics_config)
         raise ValueError("Invalid algorithm called")
diff --git a/pkg/algorithms/cmr/__init__.py b/pkg/algorithms/cmr/__init__.py
@@ -0,0 +1,4 @@
+"""
+Init for CMR Algorithm
+"""
+from .cmr import CMR
diff --git a/pkg/algorithms/cmr/cmr.py b/pkg/algorithms/cmr/cmr.py
@@ -0,0 +1,108 @@
+"""CMR - Comparing Mean Responses Algorithm"""
+
+# pylint: disable = line-too-long
+import pandas as pd
+import numpy
+
+from fmatch.logrus import SingletonLogger
+from hunter.series import  ChangePoint, ComparativeStats
+from pkg.algorithms.algorithm import Algorithm
+
+
+class CMR(Algorithm):
+    """Implementation of the CMR (Comparing Mean Responses) algorithm
+
+    Averages every previous run into a single row, keeps the most recent run
+    as a second row, and emits one ChangePoint per configured metric that
+    holds both values so downstream reporting can compare them.
+
+    Args:
+        Algorithm (Algorithm): Inherits
+    """
+
+    def _analyze(self):
+        """Average all previous runs and compare the result with the most recent run
+
+        Returns:
+            series: data series that contains attributes and full dataframe
+            change_points_by_metric: dict of metric name to list of ChangePoints
+                (empty when there is no previous run to compare against)
+        """
+        logger_instance = SingletonLogger.getLogger("Orion")
+        logger_instance.info("Starting analysis using CMR")
+        self.dataframe["timestamp"] = pd.to_datetime(self.dataframe["timestamp"])
+        # Integer view of the datetimes divided down to seconds
+        # (assumes the datetimes have nanosecond resolution)
+        self.dataframe["timestamp"] = self.dataframe["timestamp"].astype(int) // 10**9
+
+        # Fewer than two rows means there is nothing to compare the latest run with
+        if len(self.dataframe.index) < 2:
+            series = self.setup_series()
+            series.data = self.dataframe
+            return series, {}
+
+        # Collapse every row except the last into one averaged row
+        self.dataframe = self.combine_and_average_runs(self.dataframe)
+
+        series = self.setup_series()
+
+        df, change_points_by_metric = self.run_cmr(self.dataframe)
+        series.data = df
+        return series, change_points_by_metric
+
+    def run_cmr(self, dataframe_list: pd.DataFrame):
+        """
+        Generate one change point per metric from a 2 row dataframe
+
+        Row 0 is expected to hold the averaged previous runs and row 1 the
+        most recent run, as produced by combine_and_average_runs.
+
+        Args:
+            dataframe_list (pd.DataFrame): data frame of all data to compare on
+
+        Returns:
+            pd.Dataframe, dict[metric_name, list[ChangePoint]]: Returned data frame and change points
+        """
+        change_points_by_metric = {}
+
+        for column in self.metrics_config:
+            values = dataframe_list[column]
+            # Positional access: the comparison is always first row vs second row,
+            # whatever labels the index carries
+            stats = ComparativeStats(
+                mean_1=values.iloc[0],
+                mean_2=values.iloc[1],
+                # fixed values: no statistical test is run by this algorithm
+                std_1=0,
+                std_2=0,
+                pvalue=1,
+            )
+            change_points_by_metric[column] = [
+                ChangePoint(metric=column, index=1, time=0, stats=stats)
+            ]
+
+        return dataframe_list, change_points_by_metric
+
+    def combine_and_average_runs(self, dataFrame: pd.DataFrame):
+        """
+        If more than 1 previous run, mean data together into 1 single row
+        Combine with current run into 1 data frame (current run being -1 index)
+
+        Numeric columns are averaged; every other column is joined into one
+        comma separated string so no previous value is lost.
+
+        Args:
+            dataFrame (pd.DataFrame): data to combine into 2 rows, must hold at least 2 rows
+
+        Returns:
+            pd.Dataframe: data frame of most recent run and averaged previous runs
+        """
+        last_row = dataFrame.tail(1)
+        previous_runs = dataFrame.iloc[:-1]
+        combined = {}
+
+        for column in dataFrame.columns:
+            values = previous_runs[column]
+            # Look at the first value by position so a non-default index cannot
+            # raise KeyError; numpy.number covers every numpy int/float width
+            if isinstance(values.iloc[0], numpy.number):
+                combined[column] = [values.mean()]
+            else:
+                # str() keeps the join from failing on None/NaN or other non-string values
+                combined[column] = [','.join(str(value) for value in values)]
+
+        return pd.concat([pd.DataFrame(combined), last_row], ignore_index=True)
diff --git a/pkg/constants.py b/pkg/constants.py
@@ -6,3 +6,4 @@
 JSON="json"
 TEXT="text"
 JUNIT="junit"
+CMR="cmr"
diff --git a/pkg/runTest.py b/pkg/runTest.py
@@ -9,7 +9,24 @@
 import pkg.constants as cnsts
 from pkg.utils import get_datasource, process_test, get_subtracted_timestamp
 
+def get_algorithm_type(kwargs):
+    """Map the parsed command line flags to an algorithm name
 
+    Flags are checked in order: hunter_analyze, anomaly_detection, cmr.
+
+    Args:
+        kwargs (dict): passed command line arguments
+
+    Returns:
+        str: algorithm name, or None when no algorithm flag is set
+    """
+    if kwargs["hunter_analyze"]:
+        return cnsts.EDIVISIVE
+    if kwargs["anomaly_detection"]:
+        return cnsts.ISOLATION_FOREST
+    if kwargs['cmr']:
+        return cnsts.CMR
+    return None
 
 def run(**kwargs: dict[str, Any]) -> dict[str, Any]: #pylint: disable = R0914
     """run method to start the tests
@@ -48,11 +65,8 @@ def run(**kwargs: dict[str, Any]) -> dict[str, Any]: #pylint: disable = R0914
         if fingerprint_matched_df is None:
             sys.exit(3) # No data present
 
-        if kwargs["hunter_analyze"]:
-            algorithm_name = cnsts.EDIVISIVE
-        elif kwargs["anomaly_detection"]:
-            algorithm_name = cnsts.ISOLATION_FOREST
-        else:
+        algorithm_name = get_algorithm_type(kwargs)
+        if algorithm_name is None:
             return None, None
 
         algorithmFactory = AlgorithmFactory()

diff --git a/pkg/utils.py b/pkg/utils.py
@@ -258,17 +258,34 @@ def process_test(
     shortener = pyshorteners.Shortener(timeout=10)
     merged_df["buildUrl"] = merged_df["uuid"].apply(
         lambda uuid: (
-            shortener.tinyurl.short(buildUrls[uuid])
+            shorten_url(shortener, buildUrls[uuid])
             if options["convert_tinyurl"]
             else buildUrls[uuid]
-        )  # pylint: disable = cell-var-from-loop
+        )
+
+        # pylint: disable = cell-var-from-loop
     )
     merged_df=merged_df.reset_index(drop=True)
     #save the dataframe
     output_file_path = f"{options['save_data_path'].split('.')[0]}-{test['name']}.csv"
     match.save_results(merged_df, csv_file_path=output_file_path)
     return merged_df, metrics_config
 
+def shorten_url(shortener: Any, uuids: str) -> str:
+    """Shorten every build url in a comma separated string
+
+    Args:
+        shortener (Any): shortener object to use tinyurl.short on
+        uuids (str): one build url, or several separated by commas
+            (despite the name, the caller passes build urls, not uuids)
+
+    Returns:
+        str: the shortened urls joined with commas, in their original order
+    """
+    # Skip empty segments so a stray comma or a missing url never reaches the shortener
+    build_urls = (segment.strip() for segment in uuids.split(","))
+    return ','.join(
+        shortener.tinyurl.short(build_url) for build_url in build_urls if build_url
+    )
 
 def get_metadata_with_uuid(uuid: str, match: Matcher) -> Dict[Any, Any]:
     """Gets metadata of the run from each test