From d56fddd64fff98122c44cc2eaaec9c3e260c6555 Mon Sep 17 00:00:00 2001
From: Shiyu Wang <42013343+shi-yu-wang@users.noreply.github.com>
Date: Wed, 19 Jun 2024 21:05:55 -0700
Subject: [PATCH] Monthly windstats (#172)

* windstats with monthly features

Add post rules feature to windstats runner.

* windstats runner comments

* Add new feature to return only anomaly scores

* re-check pr

* Update numpy version

Pin numpy below v2.0 to avoid version conflicts. numpy v2.0 was published on 2024-06-17.

* numpy version

Keep the numpy version below v2.0, which was released on 2024-06-17 and leads to version conflicts.
---
 merlion/models/anomaly/windstats_monthly.py |  8 +--
 merlion/models/anomaly/windstats_run.py     | 67 +++++++++++++++------
 setup.py                                    |  2 +-
 3 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/merlion/models/anomaly/windstats_monthly.py b/merlion/models/anomaly/windstats_monthly.py
index 452819785..29f59aa50 100644
--- a/merlion/models/anomaly/windstats_monthly.py
+++ b/merlion/models/anomaly/windstats_monthly.py
@@ -22,7 +22,7 @@
 logger = logging.getLogger(__name__)
 
 
-class WindStatsConfig(DetectorConfig):
+class MonthlyWindStatsConfig(DetectorConfig):
     """
     Config class for `WindStats`.
     """
@@ -61,15 +61,15 @@ class MonthlyWindStats(DetectorBase):
     minimum of the scores is returned.
     """
 
-    config_class = WindStatsConfig
+    config_class = MonthlyWindStatsConfig
 
-    def __init__(self, config: WindStatsConfig = None):
+    def __init__(self, config: MonthlyWindStatsConfig = None):
         """
         config.wind_sz: the window size in minutes, default is 30 minute window
         config.max_days: maximum number of days stored in memory (only mean and std of each window are stored), default is 4 days
         here the days are first bucketized and then bucketized by window id.
         """
-        super().__init__(WindStatsConfig() if config is None else config)
+        super().__init__(MonthlyWindStatsConfig() if config is None else config)
         self.table = {}
 
     @property
diff --git a/merlion/models/anomaly/windstats_run.py b/merlion/models/anomaly/windstats_run.py
index 3ca257d7a..65d0c606b 100644
--- a/merlion/models/anomaly/windstats_run.py
+++ b/merlion/models/anomaly/windstats_run.py
@@ -5,50 +5,83 @@
 For the implementation of only weekly/monthly seasonality, specify "enable_weekly" of "enable_monthly" arguments of RunWindStats().
 """
 
-from windstats import WindStats, WindStatsConfig
-from windstats_monthly import MonthlyWindStats, MonthlyWindStatsConfig
-from ts_datasets.anomaly import NAB
+from merlion.models.anomaly.windstats import WindStats, WindStatsConfig
+from merlion.models.anomaly.windstats_monthly import MonthlyWindStats, MonthlyWindStatsConfig
 from merlion.utils import TimeSeries
-from merlion.post_process.threshold import AggregateAlarms
 
 class RunWindStats:
-    def __init__(self, threshold, enable_weekly = True, enable_monthly = True, WeeklyWindStatsConfig = WindStatsConfig(), MonthlyWindStatsConfig = MonthlyWindStatsConfig()):
+    def __init__(
+            self, 
+            threshold,
+            enable_weekly = True, 
+            enable_monthly = True, 
+            post_rule_on_anom_score = False, 
+            WeeklyWindStatsConfig = WindStatsConfig(), 
+            MonthlyWindStatsConfig = MonthlyWindStatsConfig(),
+            return_score = True
+    ):
         """
         Users can customize the configuration for weekly or monthly-based windstats. If not, then the default configuration will apply.
         """
                 
         self.enable_weekly = enable_weekly
         self.enable_monthly = enable_monthly
+        self.return_score = return_score
         assert self.enable_weekly == True or self.enable_monthly == True, "Must enable either weekly or monthly seasonality, or both!"
         
         # Threshold on identifying anomaly based on anomaly score.
         self.threshold = threshold
+        # Whether to apply each model's post rule to its anomaly scores
+        self.post_rule = post_rule_on_anom_score
         
+        # Initialize the corresponding model when weekly/monthly analysis is enabled
         if self.enable_weekly:
             self.model_weekly  = WindStats(WeeklyWindStatsConfig)
-            
         if self.enable_monthly:
             self.model_monthly = MonthlyWindStats(MonthlyWindStatsConfig)
 
+    # Identify anomaly based on the hard threshold.
     def anomalyByScore(self, scores, threshold):
-        scores.loc[abs(scores["anom_score"]) <= threshold] = 0
-        scores.loc[abs(scores["anom_score"]) > threshold] = 1
+        labels = scores.copy()
+        labels.loc[abs(labels["anom_score"]) <= threshold] = 0
+        labels.loc[abs(labels["anom_score"]) > threshold] = 1
         
-        scores.rename(columns = {"anom_score": "anomaly"}, inplace = True)
-        return scores
+        labels.rename(columns = {"anom_score": "anomaly"}, inplace = True)
+        return labels
+    
+    # Filter anomaly scores based on post rules. Same as "get_anomaly_label" in WindStats
+    def get_anomaly_label(self, model, ts):
+        scores = model.train(ts)
+        return model.post_rule(scores) if model.post_rule is not None else scores
     
     def run(self, ts):
         if self.enable_weekly:
-            scores_weekly = self.model_weekly.train(ts).to_pd()
-            scores_weekly = self.anomalyByScore(scores_weekly, self.threshold)
+            if self.post_rule:
+                scores_weekly = self.get_anomaly_label(self.model_weekly, ts).to_pd()
+            else:
+                scores_weekly = self.model_weekly.train(ts).to_pd()
+            labels_weekly = self.anomalyByScore(scores_weekly, self.threshold)
         
         if self.enable_monthly:
-            scores_monthly = self.model_monthly.train(ts).to_pd()
-            scores_monthly = self.anomalyByScore(scores_monthly, self.threshold)
+            if self.post_rule:
+                scores_monthly = self.get_anomaly_label(self.model_monthly, ts).to_pd()
+            else:
+                scores_monthly = self.model_monthly.train(ts).to_pd()
+            labels_monthly = self.anomalyByScore(scores_monthly, self.threshold)
             
+        # Anomaly is identified if and only if it's detected in both weekly and monthly patterns.
         if self.enable_weekly and self.enable_monthly:
-            return scores_weekly * scores_monthly
+            if self.return_score:
+                return scores_weekly, scores_monthly, scores_weekly * scores_monthly
+            else:
+                return scores_weekly, scores_monthly, labels_weekly * labels_monthly
         elif self.enable_weekly:
-            return scores_weekly
+            if self.return_score:
+                return scores_weekly, None, scores_weekly
+            else:
+                return scores_weekly, None, labels_weekly
         else:
-            return scores_monthly
+            if self.return_score:
+                return None, scores_monthly, scores_monthly
+            else:
+                return None, scores_monthly, labels_monthly
diff --git a/setup.py b/setup.py
index 16a606e86..beee10cb1 100644
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,7 @@ def read_file(fname):
         "py4j",
         "matplotlib",
         "plotly>=4.13",
-        "numpy>=1.21",  # 1.21 remediates a security risk
+        "numpy>=1.21,<2.0",  # 1.21 remediates a security risk
         "packaging",
         "pandas>=1.1.0",  # >=1.1.0 for origin kwarg to df.resample()
         "prophet>=1.1",  # 1.1 removes dependency on pystan