From d56fddd64fff98122c44cc2eaaec9c3e260c6555 Mon Sep 17 00:00:00 2001 From: Shiyu Wang <42013343+shi-yu-wang@users.noreply.github.com> Date: Wed, 19 Jun 2024 21:05:55 -0700 Subject: [PATCH] Monthly windstats (#172) * windstats with monthly features Add post rules feature to windstats runner. * windstats runner comments * Add new feature to return only anomaly scores * re-check pr * Update numpy version Avoid using numpy v2.0 to avoid version conflicts. numpy v2.0 was published on 2024-6-17. * numpy version Keep numpy version under v2.0, which was released on 2024-06-17 and will lead to conflict. --- merlion/models/anomaly/windstats_monthly.py | 8 +-- merlion/models/anomaly/windstats_run.py | 67 +++++++++++++++------ setup.py | 2 +- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/merlion/models/anomaly/windstats_monthly.py b/merlion/models/anomaly/windstats_monthly.py index 452819785..29f59aa50 100644 --- a/merlion/models/anomaly/windstats_monthly.py +++ b/merlion/models/anomaly/windstats_monthly.py @@ -22,7 +22,7 @@ logger = logging.getLogger(__name__) -class WindStatsConfig(DetectorConfig): +class MonthlyWindStatsConfig(DetectorConfig): """ Config class for `WindStats`. """ @@ -61,15 +61,15 @@ class MonthlyWindStats(DetectorBase): minimum of the scores is returned. """ - config_class = WindStatsConfig + config_class = MonthlyWindStatsConfig - def __init__(self, config: WindStatsConfig = None): + def __init__(self, config: MonthlyWindStatsConfig = None): """ config.wind_sz: the window size in minutes, default is 30 minute window config.max_days: maximum number of days stored in memory (only mean and std of each window are stored), default is 4 days here the days are first bucketized and then bucketized by window id. """ - super().__init__(WindStatsConfig() if config is None else config) + super().__init__(MonthlyWindStatsConfig() if config is None else config) self.table = {} @property diff --git a/merlion/models/anomaly/windstats_run.py b/merlion/models/anomaly/windstats_run.py index 3ca257d7a..65d0c606b 100644 --- a/merlion/models/anomaly/windstats_run.py +++ b/merlion/models/anomaly/windstats_run.py @@ -5,50 +5,83 @@ For the implementation of only weekly/monthly seasonality, specify "enable_weekly" of "enable_monthly" arguments of RunWindStats(). """ -from windstats import WindStats, WindStatsConfig -from windstats_monthly import MonthlyWindStats, MonthlyWindStatsConfig -from ts_datasets.anomaly import NAB +from merlion.models.anomaly.windstats import WindStats, WindStatsConfig +from merlion.models.anomaly.windstats_monthly import MonthlyWindStats, MonthlyWindStatsConfig from merlion.utils import TimeSeries -from merlion.post_process.threshold import AggregateAlarms class RunWindStats: - def __init__(self, threshold, enable_weekly = True, enable_monthly = True, WeeklyWindStatsConfig = WindStatsConfig(), MonthlyWindStatsConfig = MonthlyWindStatsConfig()): + def __init__( + self, + threshold, + enable_weekly = True, + enable_monthly = True, + post_rule_on_anom_score = False, + WeeklyWindStatsConfig = WindStatsConfig(), + MonthlyWindStatsConfig = MonthlyWindStatsConfig(), + return_score = True + ): """ Users can customize the configuration for weekly or monthly-based windstats. If not, then the default configuration will apply. """ self.enable_weekly = enable_weekly self.enable_monthly = enable_monthly + self.return_score = return_score assert self.enable_weekly == True or self.enable_monthly == True, "Must enable either weekly or monthly seasonality, or both!" # Threshold on identifying anomaly based on anomaly score. self.threshold = threshold + # If apply post rules on anomaly score + self.post_rule = post_rule_on_anom_score + # Intialize according model if enable weekly/monthly analysis if self.enable_weekly: self.model_weekly = WindStats(WeeklyWindStatsConfig) - if self.enable_monthly: self.model_monthly = MonthlyWindStats(MonthlyWindStatsConfig) + # Identify anomaly based on the hard threshold. def anomalyByScore(self, scores, threshold): - scores.loc[abs(scores["anom_score"]) <= threshold] = 0 - scores.loc[abs(scores["anom_score"]) > threshold] = 1 + labels = scores.copy() + labels.loc[abs(labels["anom_score"]) <= threshold] = 0 + labels.loc[abs(labels["anom_score"]) > threshold] = 1 - scores.rename(columns = {"anom_score": "anomaly"}, inplace = True) - return scores + labels.rename(columns = {"anom_score": "anomaly"}, inplace = True) + return labels + + # Filter anomaly scores based on post rules. Same as "get_anomaly_label" in WindStats + def get_anomaly_label(self, model, ts): + scores = model.train(ts) + return model.post_rule(scores) if model.post_rule is not None else scores def run(self, ts): if self.enable_weekly: - scores_weekly = self.model_weekly.train(ts).to_pd() - scores_weekly = self.anomalyByScore(scores_weekly, self.threshold) + if self.post_rule: + scores_weekly = self.get_anomaly_label(self.model_weekly, ts).to_pd() + else: + scores_weekly = self.model_weekly.train(ts).to_pd() + labels_weekly = self.anomalyByScore(scores_weekly, self.threshold) if self.enable_monthly: - scores_monthly = self.model_monthly.train(ts).to_pd() - scores_monthly = self.anomalyByScore(scores_monthly, self.threshold) + if self.post_rule: + scores_monthly = self.get_anomaly_label(self.model_monthly, ts).to_pd() + else: + scores_monthly = self.model_monthly.train(ts).to_pd() + labels_monthly = self.anomalyByScore(scores_monthly, self.threshold) + # Anomaly is identified if and only if it's detected in both weekly and monthly patterns. if self.enable_weekly and self.enable_monthly: - return scores_weekly * scores_monthly + if self.return_score: + return scores_weekly, scores_monthly, scores_weekly * scores_monthly + else: + return scores_weekly, scores_monthly, labels_weekly * labels_monthly elif self.enable_weekly: - return scores_weekly + if self.return_score: + return scores_weekly, None, scores_weekly + else: + return scores_weekly, None, labels_weekly else: - return scores_monthly + if self.return_score: + return None, scores_monthly, scores_monthly + else: + return None, scores_monthly, labels_monthly diff --git a/setup.py b/setup.py index 16a606e86..beee10cb1 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,7 @@ def read_file(fname): "py4j", "matplotlib", "plotly>=4.13", - "numpy>=1.21", # 1.21 remediates a security risk + "numpy>=1.21,<2.0", # 1.21 remediates a security risk "packaging", "pandas>=1.1.0", # >=1.1.0 for origin kwarg to df.resample() "prophet>=1.1", # 1.1 removes dependency on pystan