Refactored examples. Fixed evaluation comments. int requirement for y…

…_pred changed to float.
selimfirat · Aug 15, 2020 · 9827eb4 · 9827eb4
1 parent 0d047b7
commit 9827eb4
Show file tree

Hide file tree

Showing 8 changed files with 77 additions and 56 deletions.
diff --git a/examples/example_ensemble.py b/examples/example_ensemble.py
@@ -1,36 +1,41 @@
-from sklearn.utils import shuffle
-from tqdm import tqdm
+# Import modules.
 from pysad.evaluation import AUROCMetric
 from pysad.models import LODA
 from pysad.models import xStream
 from pysad.utils import ArrayStreamer
 from pysad.transform.ensemble import AverageScoreEnsembler
 from pysad.utils import Data
+from sklearn.utils import shuffle
+from tqdm import tqdm
 import numpy as np
 
 # This example demonstrates the usage of an ensembling method.
 if __name__ == '__main__':
-    np.random.seed(61)
+    np.random.seed(61)  # Fix random seed.
 
     data = Data("data")
-    X_all, y_all = data.get_data("arrhythmia.mat")
-    X_all, y_all = shuffle(X_all, y_all)
-    iterator = ArrayStreamer(shuffle=False)
-    auroc = AUROCMetric()
+    X_all, y_all = data.get_data("arrhythmia.mat")  # Load Arrhythmia data.
+    X_all, y_all = shuffle(X_all, y_all)  # Shuffle data.
+    iterator = ArrayStreamer(shuffle=False)  # Create streamer to simulate streaming data.
+    auroc = AUROCMetric()  # Tracker of area under receiver-operating-characteristics curve metric.
 
-    models = [
+    models = [  # Models to be ensembled.
         xStream(),
         LODA()
     ]
-    ensembler = AverageScoreEnsembler()
+    ensembler = AverageScoreEnsembler()  # Ensembler module.
 
-    for X, y in tqdm(iterator.iter(X_all[100:], y_all[100:])):
+    for X, y in tqdm(iterator.iter(X_all, y_all)):  # Iterate over examples.
         model_scores = np.empty(len(models), dtype=np.float)
+
+        # Fit and score each model.
         for i, model in enumerate(models):
             model.fit_partial(X)
             model_scores[i] = model.score_partial(X)
 
-        score = ensembler.fit_transform_partial(model_scores)
-        auroc.update(y, score)
+        score = ensembler.fit_transform_partial(model_scores)  # fit to ensembler model and get ensembled score.
+
+        auroc.update(y, score)  # update AUROC metric.
 
+    # Output score.
     print("AUROC: ", auroc.get())
diff --git a/examples/example_probability_calibration.py b/examples/example_probability_calibration.py
@@ -1,17 +1,22 @@
+# Import modules.
 from pysad.models import xStream
 from pysad.transform.probability_calibration import ConformalProbabilityCalibrator
 from pysad.utils import Data
+import numpy as np
 
 # This example demonstrates the usage of the probability calibrators.
 if __name__ == "__main__":
-    model = xStream()
-    calibrator = ConformalProbabilityCalibrator(windowed=True, window_size=300)
-    streaming_data = Data().get_iterator("arrhythmia.mat")
+    np.random.seed(61)  # Fix seed.
 
-    for i, (x, y_true) in enumerate(streaming_data):
-        anomaly_score = model.fit_score_partial(x)
+    model = xStream()  # Init model.
+    calibrator = ConformalProbabilityCalibrator(windowed=True, window_size=300) # Init probability calibrator.
+    streaming_data = Data().get_iterator("arrhythmia.mat") # Get streamer.
 
-        calibrated_score = calibrator.fit_transform(anomaly_score)
-        print(calibrated_score)
-        if calibrated_score < 0.05: # ıf probabability is less than 5%.
+    for i, (x, y_true) in enumerate(streaming_data):  # Stream data.
+        anomaly_score = model.fit_score_partial(x)  # Fit to an instance x and score it.
+
+        calibrated_score = calibrator.fit_transform(anomaly_score)  # Fit & calibrate score.
+
+        # Output if the instance is anomalous.
+        if calibrated_score < 0.05:  # If probability is less than 5%.
             print(f"Alert: {i}th data point is anomalous.")
diff --git a/examples/example_pyod_integration.py b/examples/example_pyod_integration.py
@@ -1,3 +1,4 @@
+# Import modules.
 from pyod.models.iforest import IForest
 from sklearn.utils import shuffle
 from pysad.evaluation import AUROCMetric
@@ -7,26 +8,27 @@
 from tqdm import tqdm
 import numpy as np
 
-# This example demonstrates the integration of a PYOD model via ReferenceWindowModel.
+# This example demonstrates the integration of a PyOD model via ReferenceWindowModel.
 if __name__ == "__main__":
-    np.random.seed(61)
+    np.random.seed(61)  # Fix seed.
+
+    # Get data to stream.
     data = Data("data")
     X_all, y_all = data.get_data("arrhythmia.mat")
     X_all, y_all = shuffle(X_all, y_all)
+    iterator = ArrayStreamer(shuffle=False)
 
+    # Fit reference window integration to first 100 instances initially.
     model = ReferenceWindowModel(model_cls=IForest, window_size=240, sliding_size=30, initial_window_X=X_all[:100])
 
-    iterator = ArrayStreamer(shuffle=False)
-
-    auroc = AUROCMetric()
+    auroc = AUROCMetric()  # Init area under receiver-operating-characteristics curve metric tracker.
 
-    y_pred = []
     for X, y in tqdm(iterator.iter(X_all[100:], y_all[100:])):
-        model.fit_partial(X)
-        score = model.score_partial(X)
 
-        y_pred.append(score)
+        model.fit_partial(X)  # Fit to the instance.
+        score = model.score_partial(X)  # Score the instance.
 
-        auroc.update(y, score)
+        auroc.update(y, score)  # Update the metric.
 
+    # Output AUROC metric.
     print("AUROC: ", auroc.get())
diff --git a/examples/example_statistics.py b/examples/example_statistics.py
@@ -1,18 +1,23 @@
-import numpy as np
-
+# Import modules.
 from pysad.statistics import AverageMeter
 from pysad.statistics import VarianceMeter
+import numpy as np
 
 # This example shows the usage of statistics module for streaming data.
 if __name__ == '__main__':
 
+    # Init data with mean 0 and standard deviation 1.
     X = np.random.randn(1000)
+
+    # Init statistics trackers for mean and variance.
     avg_meter = AverageMeter()
     var_meter = VarianceMeter()
 
     for i in range(1000):
+        # Update statistics trackers.
         avg_meter.update(X[i])
         var_meter.update(X[i])
 
+    # Output resulting statistics.
     print(f"Average: {avg_meter.get()}, Standard deviation: {np.sqrt(var_meter.get())}")
     # The results are close to mean 0 and std 1 because we init the array from the standard normal distribution via np.random.randn.
diff --git a/examples/example_usage.py b/examples/example_usage.py
@@ -1,3 +1,4 @@
+# Import modules.
 from sklearn.utils import shuffle
 from pysad.evaluation import AUROCMetric
 from pysad.models import xStream
@@ -8,28 +9,29 @@
 from tqdm import tqdm
 import numpy as np
 
-# This example demonstrates the usage of the most modules in pysad framework.
+# This example demonstrates the usage of most modules in the PySAD framework.
 if __name__ == "__main__":
-    np.random.seed(61)
-    data = Data("data")
+    np.random.seed(61)  # Fix random seed.
 
+    # Get data to stream.
+    data = Data("data")
     X_all, y_all = data.get_data("arrhythmia.mat")
     X_all, y_all = shuffle(X_all, y_all)
 
-    iterator = ArrayStreamer(shuffle=False)
-    model = xStream()
-    preprocessor = InstanceUnitNormScaler()
-    postprocessor = RunningAveragePostprocessor(window_size=5)
-    auroc = AUROCMetric()
+    iterator = ArrayStreamer(shuffle=False)  # Init streamer to simulate streaming data.
+
+    model = xStream()  # Init xStream anomaly detection model.
+    preprocessor = InstanceUnitNormScaler()  # Init normalizer.
+    postprocessor = RunningAveragePostprocessor(window_size=5)  # Init running average postprocessor.
+    auroc = AUROCMetric()  # Init area under receiver-operating-characteristics curve metric.
 
-    y_pred = []
-    for X, y in tqdm(iterator.iter(X_all[100:], y_all[100:])):
-        X = preprocessor.fit_transform_partial(X)
+    for X, y in tqdm(iterator.iter(X_all[100:], y_all[100:])):  # Stream data.
+        X = preprocessor.fit_transform_partial(X)  # Fit preprocessor to and transform the instance.
 
-        score = model.fit_score_partial(X)
-        score = postprocessor.fit_transform_partial(score)
+        score = model.fit_score_partial(X)  # Fit model to and score the instance.
+        score = postprocessor.fit_transform_partial(score)  # Apply running averaging to the score.
 
-        y_pred.append(score)
-        auroc.update(y, score)
+        auroc.update(y, score)  # Update AUROC metric.
 
+    # Output the resulting AUROC metric.
     print("AUROC: ", auroc.get())
diff --git a/examples/example_usage_short.py b/examples/example_usage_short.py
@@ -1,14 +1,16 @@
+# Import modules.
 from pysad.evaluation import AUROCMetric
 from pysad.models import LODA
 from pysad.utils import Data
 
-model = LODA()
-metric = AUROCMetric()
-streaming_data = Data().get_iterator("arrhythmia.mat")
+model = LODA()  # Init model
+metric = AUROCMetric()  # Init area under receiver-operating-characteristics curve metric
+streaming_data = Data().get_iterator("arrhythmia.mat") # Get data streamer.
 
-for x, y_true in streaming_data:
-    anomaly_score = model.fit_score_partial(x)
+for x, y_true in streaming_data:  # Stream data.
+    anomaly_score = model.fit_score_partial(x)  # Fit the model to the instance and score the instance.
 
-    metric.update(y_true, anomaly_score)
+    metric.update(y_true, anomaly_score)  # Update the AUROC metric.
 
+# Output the resulting AUROCMetric.
 print(f"Area under ROC metric is {metric.get()}.")
diff --git a/pysad/core/base_metric.py b/pysad/core/base_metric.py
@@ -14,7 +14,7 @@ def update(self, y_true, y_pred):
 
         Args:
             y_true (int): Ground truth class. Either 1 or 0.
-            y_pred (int): Predicted class. Either 1 or 0.
+            y_pred (float): Predicted class or anomaly score. Higher values correspond to more anomalousness and lower values correspond to more normalness.
         """
         pass
 

diff --git a/pysad/evaluation/metrics.py b/pysad/evaluation/metrics.py
@@ -16,7 +16,7 @@ def update(self, y_true, y_pred):
 
         Args:
             y_true (int): Ground truth class. Either 1 or 0.
-            y_pred (int): Predicted class. Either 1 or 0.
+            y_pred (float): Predicted class or anomaly score. Higher values correspond to more anomalousness and lower values correspond to more normalness.
         """
         self.y_true.append(y_true)
         self.y_pred.append(y_pred)
@@ -37,7 +37,7 @@ def _evaluate(self, y_true, y_pred):
 
         Args:
             y_true (list[int]): Ground truth classes.
-            y_pred (list[int]): Predicted classes.
+            y_pred (list[float]): Predicted classes or scores.
         """
         pass
-Original file line number
+Diff line change
@@ Expand Up / @@ -14,7 +14,7 @@ def update(self, y_true, y_pred): @@
             Args:
                 y_true (int): Ground truth class. Either 1 or 0.
-                y_pred (int): Predicted class. Either 1 or 0.
+                y_pred (float): Predicted class or anomaly score. Higher values correspond to more anomalousness and lower values correspond to more normalness.
             """
             pass
@@ Expand Down @@