Skip to content

Commit

Permalink
adding a test for subnormal (underflow) values for AWS FeatureStore
Browse files Browse the repository at this point in the history
  • Loading branch information
brifordwylie committed Dec 3, 2024
1 parent 3683294 commit 88f51a6
Show file tree
Hide file tree
Showing 2 changed files with 76 additions and 0 deletions.
76 changes: 76 additions & 0 deletions tests/artifacts/featureset_ingest_boundaries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""Tests for the FeatureSet Ingest Boundaries (under/overflow, NaN, Inf)"""

import sys

import pandas as pd
import pytest
from sageworks.api import FeatureSet
from sageworks.core.transforms.pandas_transforms import PandasToFeatures


# Valid subnormal test
def test_subnormals(subnormals=()):
    """Report whether each value is a positive IEEE 754 subnormal float64.

    Prints a "Value: ..." line for every input, followed by a verdict line.
    A value is reported as a valid subnormal when it lies strictly between
    zero and the smallest positive normal float64 (``sys.float_info.min``).
    Zero, negative values, normal values, infinities and NaN are reported
    as invalid.

    Args:
        subnormals: Iterable of floats to classify. Defaults to an empty
            tuple so that pytest, which collects this helper because of its
            ``test_`` prefix, runs it as a no-op instead of erroring on a
            missing ``subnormals`` fixture.
    """
    # Exact lower edge of the normal range (2.2250738585072014e-308); a rounded
    # literal such as 2.225e-308 would misclassify subnormals just below it.
    smallest_normal = sys.float_info.min

    for val in subnormals:
        print(f"Value: {val}")
        if val == 0:
            print(" Invalid: Represents zero, not subnormal.")
        elif not (0 < val < smallest_normal):
            # Written as a negated range test so NaN (for which every
            # comparison is False) is rejected rather than reported as valid.
            print(" Invalid: Out of subnormal range.")
        else:
            print(" Valid: IEEE 754 subnormal.")


@pytest.mark.long
def test_underflow():
    """Ingest near-underflow float64 values into a FeatureSet and report dropped rows.

    Underflow Analysis:
        First 5 rows: Above Subnormal Space:
            - Values just above the smallest positive normal number
              (~2.225 x 10^-308).
            - These are fully representable in `float64` with normal precision.
        Last 5 rows: Within Subnormal Space:
            - Values between the smallest positive normal number
              (~2.225 x 10^-308) and the smallest positive *subnormal* number
              (~4.94 x 10^-324).
            - These are representable but with reduced precision.

    The test is exploratory: it prints the ingested data and the set of ids
    that did not come back from the FeatureSet, without asserting on them.
    """
    # Every entry must be >= 2.2250738585072014e-308 (the smallest normal
    # float64); 2.2251e-308 sits just above that edge. (A value such as
    # 2.22e-308 would already be subnormal.)
    above_subnormal = [2.3e-308, 5e-308, 1e-307, 2e-307, 2.2251e-308]
    within_subnormal = [1e-323, 5e-323, 1.5e-323, 2e-323, 4.94e-324]

    # Sanity check: confirm that our subnormals really are subnormals
    test_subnormals(within_subnormal)

    # Create a test DataFrame with above_subnormal and within_subnormal values
    data = {
        "feature1": [42] * 10,  # Control variable :)
        "underflow_feature": above_subnormal + within_subnormal,
        "id": list(range(1, 11)),
    }
    test_df = pd.DataFrame(data)
    print("Test DataFrame:")
    print(test_df)

    # Transform and ingest the dataframe using PandasToFeatures
    feature_set_name = "test_underflow"
    to_features = PandasToFeatures(feature_set_name)
    to_features.set_output_tags(["test", "underflow"])
    to_features.set_input(test_df, id_column="id")
    to_features.transform()

    # Pull the transformed data from the FeatureSet and verify
    fs = FeatureSet(feature_set_name)
    fs_df = fs.pull_dataframe()
    fs_df = fs_df.sort_values(by="id").reset_index(drop=True)  # Sort by ids
    print("FeatureSet DataFrame:")
    print(fs_df)

    # Check for dropped rows: ids sent to ingest that did not come back
    original_ids = set(test_df["id"])
    ingested_ids = set(fs_df["id"])
    rejected_ids = original_ids - ingested_ids
    print(f"Rejected IDs (due to underflow or ingest errors): {rejected_ids}")


# Script entry point: lets the underflow check be run directly with
# `python <this file>` instead of going through pytest.
if __name__ == "__main__":
    test_underflow()
File renamed without changes.

0 comments on commit 88f51a6

Please sign in to comment.