From 88f51a6a8b869791160f4f86f95d5403e1f21ba1 Mon Sep 17 00:00:00 2001
From: Brian Wylie <briford.wylie@gmail.com>
Date: Tue, 3 Dec 2024 14:39:10 -0700
Subject: [PATCH] adding a test for subnormal (underflow) values for AWS
 FeatureStore

---
 .../artifacts/featureset_ingest_boundaries.py | 76 +++++++++++++++++++
 ...ature_set_tests.py => featureset_tests.py} |  0
 2 files changed, 76 insertions(+)
 create mode 100644 tests/artifacts/featureset_ingest_boundaries.py
 rename tests/artifacts/{feature_set_tests.py => featureset_tests.py} (100%)

diff --git a/tests/artifacts/featureset_ingest_boundaries.py b/tests/artifacts/featureset_ingest_boundaries.py
new file mode 100644
index 000000000..7be8b2495
--- /dev/null
+++ b/tests/artifacts/featureset_ingest_boundaries.py
@@ -0,0 +1,76 @@
+"""Tests for the FeatureSet Ingest Boundaries (under/overflow, NaN, Inf)"""
+
+import pytest
+import pandas as pd
+from sageworks.api import FeatureSet
+from sageworks.core.transforms.pandas_transforms import PandasToFeatures
+
+
+# Valid subnormal test
+def test_subnormals(subnormals):
+    """Test IEEE 754 subnormal numbers"""
+
+    # Check if the values are valid subnormals
+    for val in subnormals:
+        print(f"Value: {val}")
+        if val == 0:
+            print("  Invalid: Represents zero, not subnormal.")
+        elif val < 4.94e-324 or val >= 2.225e-308:
+            print("  Invalid: Out of subnormal range.")
+        else:
+            print("  Valid: IEEE 754 subnormal.")
+
+
+@pytest.mark.long
+def test_underflow():
+    """Underflow Analysis:
+
+    First 5 rows: Above Subnormal Space:
+       - Values just above the smallest positive normal number (2.225 x 10^-308).
+       - These are fully representable in `float64` with normal precision.
+    Last 5 rows: Within Subnormal Space:
+       - Values between the smallest positive normal number (2.225 x 10^-308)
+         and the smallest positive ^subnormal^ number (4.94 x 10^-324)
+       - These are representable but with reduced precision.
+    """
+    above_subnormal = [2.3e-308, 5e-308, 1e-307, 2e-307, 2.22e-308]
+    within_subnormal = [1e-323, 5e-323, 1.5e-323, 2e-323, 4.94e-324]
+
+    # We're going to test that are subnormals are really subnormals
+    test_subnormals(within_subnormal)
+
+    # Create a test DataFrame with above_subnormal and within_subnormal values
+    data = {
+        "feature1": [42] * 10,  # Control variable :)
+        "underflow_feature": above_subnormal + within_subnormal,
+        "id": list(range(1, 11)),
+    }
+    test_df = pd.DataFrame(data)
+    print("Test DataFrame:")
+    print(test_df)
+    test_df = pd.DataFrame(data)
+    print(test_df)
+
+    # Transform and ingest the dataframe using PandasToFeatures
+    feature_set_name = "test_underflow"
+    to_features = PandasToFeatures(feature_set_name)
+    to_features.set_output_tags(["test", "underflow"])
+    to_features.set_input(test_df, id_column="id")
+    to_features.transform()
+
+    # Pull the transformed data from the FeatureSet and verify
+    fs = FeatureSet(feature_set_name)
+    fs_df = fs.pull_dataframe()
+    fs_df = fs_df.sort_values(by="id").reset_index(drop=True)  # Sort by ids
+    print("FeatureSet DataFrame:")
+    print(fs_df)
+
+    # Step 4: Check for dropped rows
+    original_ids = set(test_df["id"])
+    ingested_ids = set(fs_df["id"])
+    rejected_ids = original_ids - ingested_ids
+    print(f"Rejected IDs (due to underflow or ingest errors): {rejected_ids}")
+
+
+if __name__ == "__main__":
+    test_underflow()
diff --git a/tests/artifacts/feature_set_tests.py b/tests/artifacts/featureset_tests.py
similarity index 100%
rename from tests/artifacts/feature_set_tests.py
rename to tests/artifacts/featureset_tests.py