Fixed null check and unittest outputter.
Fixed the null check, which expected the literal string 'NULL' instead of an actual null value.
Fixed the unittest outputter so it no longer ignores tests named in the new format of a dimension followed by a generic test case name.
Added a new test case to ensure all check configs load without errors.

Signed-off-by: Varun Mittal <[email protected]>
varunmittal91 committed Dec 18, 2023
1 parent 187d745 commit 41865fd
Showing 13 changed files with 112 additions and 84 deletions.
5 changes: 5 additions & 0 deletions focus_validator/config_objects/common.py
@@ -68,3 +68,8 @@ def generate_check_friendly_name(check, column_id):
return f"{column_id} does not allow null values."
elif isinstance(check, DataTypeCheck):
return f"{column_id} requires values of type {check.data_type.value}."
elif isinstance(check, SQLQueryCheck):
sql_query = " ".join([word.strip() for word in check.sql_query.split()])
return f"{column_id} requires values that return true when evaluated by the following SQL query: {sql_query}"
else:
raise NotImplementedError(f"Check {check} not implemented.")
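
Note: the new SQLQueryCheck branch flattens the query's whitespace before embedding it in the friendly name. A minimal sketch of that normalization, reusing the query that appears in the SQL-check test further down (output shown as a comment):

sql_query = """
    SELECT test_dimension,
           CASE WHEN test_dimension = 'some-value' THEN true
           ELSE false END AS check_output
    FROM df;
"""
# str.split() with no argument splits on any run of whitespace, so joining the
# stripped words with single spaces collapses the query onto one line.
print(" ".join(word.strip() for word in sql_query.split()))
# SELECT test_dimension, CASE WHEN test_dimension = 'some-value' THEN true ELSE false END AS check_output FROM df;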
@@ -72,7 +72,7 @@ def __generate_pandera_check__(rule: Rule, check_id)
)
elif isinstance(check, AllowNullsCheck):
return pa.Check.check_not_null(
error=error_string, ignore_na=False, allow_nulls=check.allow_nulls
error=error_string, ignore_na=check.allow_nulls
)
else:
raise FocusNotImplementedError(
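
For context, ignore_na is a standard option on pandera checks: when it is true, NA rows are dropped before the check function runs. Passing ignore_na=check.allow_nulls therefore skips nulls when the rule allows them and feeds them to check_not_null (which fails them) when it does not. A rough sketch of that behaviour, assuming a plain pa.Check with a lambda instead of the project's registered method:

import pandas as pd
import pandera as pa

series = pd.Series(["a", None, "b"])

# allow_nulls=True maps to ignore_na=True: the null row is skipped, validation passes.
lenient = pa.SeriesSchema(
    checks=pa.Check(lambda s: s.notnull(), ignore_na=True), nullable=True
)
lenient.validate(series)

# allow_nulls=False maps to ignore_na=False: the null row reaches the check and fails it.
strict = pa.SeriesSchema(
    checks=pa.Check(lambda s: s.notnull(), ignore_na=False), nullable=True
)
try:
    strict.validate(series)
except pa.errors.SchemaError as err:
    print(err.failure_cases)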
58 changes: 31 additions & 27 deletions focus_validator/config_objects/rule.py
@@ -1,8 +1,9 @@
import os
from typing import Optional, Union
from typing import Annotated, Optional, Union

import yaml
from pydantic import BaseModel, ConfigDict, model_validator
from pydantic import BaseModel, ConfigDict, Field, field_validator
from pydantic_core.core_schema import ValidationInfo

from focus_validator.config_objects.common import (
SIMPLE_CHECKS,
@@ -33,44 +34,47 @@ class Rule(BaseModel):
SIMPLE_CHECKS, AllowNullsCheck, ValueInCheck, DataTypeCheck, SQLQueryCheck
]

check_friendly_name: Optional[
str
check_friendly_name: Annotated[
Optional[str], Field(validate_default=True)
] = None # auto generated or else can be overwritten
check_type_friendly_name: Optional[str] = None
check_type_friendly_name: Annotated[
Optional[str], Field(validate_default=True)
] = None

model_config = ConfigDict(
extra="forbid", # prevents config from containing any undesirable keys
frozen=True, # prevents any modification to any attribute onces loaded from config
)

# @root_validator
@model_validator(mode="before")
@classmethod
def root_val(cls, values):
"""
Root validator that checks for all options passed in the config and generate missing options.
"""
if values is None:
values = {}

check = values.get("check")
check_friendly_name = values.get("check_friendly_name")
column_id = values.get("column_id")
if check is not None:
@field_validator("check_friendly_name")
def validate_or_generate_check_friendly_name(
cls, check_friendly_name, validation_info: ValidationInfo
):
values = validation_info.data
if (
check_friendly_name is None
and values.get("check") is not None
and values.get("column_id") is not None
):
check_friendly_name = generate_check_friendly_name(
check=values["check"], column_id=values["column_id"]
)
return check_friendly_name

@field_validator("check_type_friendly_name")
def validate_or_generate_check_type_friendly_name(
cls, check_type_friendly_name, validation_info: ValidationInfo
):
values = validation_info.data
if values.get("check") is not None and values.get("column_id") is not None:
check = values.get("check")
if isinstance(check, str):
check_type_friendly_name = "".join(
[word.title() for word in check.split("_")]
)
else:
check_type_friendly_name = check.__class__.__name__
values["check_type_friendly_name"] = check_type_friendly_name

if check_friendly_name is None and column_id is not None:
values["check_friendly_name"] = generate_check_friendly_name(
check=check, column_id=column_id
)

return values
return check_type_friendly_name

@staticmethod
def load_yaml(
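
The before-mode root validator gives way to per-field validators. Field(validate_default=True) is what makes them run even when the field is omitted and falls back to its None default, and ValidationInfo.data exposes the fields validated before the current one (so check and column_id must be declared earlier in the model). A minimal standalone sketch of the same pattern, with hypothetical field and class names:

from typing import Annotated, Optional

from pydantic import BaseModel, Field, field_validator
from pydantic_core.core_schema import ValidationInfo


class RuleSketch(BaseModel):
    column_id: str
    check: str
    # Without validate_default=True, pydantic would skip the validator below
    # whenever friendly_name is omitted, because defaults are not validated.
    friendly_name: Annotated[Optional[str], Field(validate_default=True)] = None

    @field_validator("friendly_name")
    @classmethod
    def generate_friendly_name(cls, value, info: ValidationInfo):
        # info.data only contains fields declared (and validated) before this one.
        if value is None and "column_id" in info.data:
            return f"{info.data['column_id']}: {info.data['check']}"
        return value


print(RuleSketch(column_id="ChargeType", check="check_unique").friendly_name)
# ChargeType: check_unique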
2 changes: 1 addition & 1 deletion focus_validator/data_loaders/csv_data_loader.py
@@ -6,4 +6,4 @@ def __init__(self, data_filename):
self.data_filename = data_filename

def load(self):
return pd.read_csv(self.data_filename, keep_default_na=False)
return pd.read_csv(self.data_filename)
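
Dropping keep_default_na=False restores pandas' default NA handling, so empty cells and the literal token NULL are loaded as real nulls rather than the strings "" and "NULL", which is what the simplified check_not_null in focus_validator/rules/checks.py below relies on. A small sketch of the difference with made-up CSV content:

import io

import pandas as pd

csv_text = "value,other\nNULL,\nsome-value,x\n"

# Default behaviour: both the "NULL" token and the empty cell become NaN.
print(pd.read_csv(io.StringIO(csv_text)).isnull())

# Previous behaviour with keep_default_na=False: they stay as the strings
# "NULL" and "", which is why the old check had to match those literals.
print(pd.read_csv(io.StringIO(csv_text), keep_default_na=False))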
5 changes: 2 additions & 3 deletions focus_validator/main.py
@@ -1,8 +1,7 @@
import argparse
import os
import sys

from focus_validator.validator import Validator
from focus_validator.validator import Validator, DEFAULT_VERSION_SETS_PATH


def main():
@@ -37,7 +36,7 @@ def main():
)
parser.add_argument(
"--rule-set-path",
default=os.path.join("focus_validator", "rules", "version_sets"),
default=DEFAULT_VERSION_SETS_PATH,
help="Path to rules definitions",
)
parser.add_argument(
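
Pointing the CLI default at DEFAULT_VERSION_SETS_PATH keeps main() and the new rule-loading test reading the same rules directory. A trimmed sketch of the argparse wiring, assuming the constant resolves to the packaged rules/version_sets directory:

import argparse

from focus_validator.validator import DEFAULT_VERSION_SETS_PATH

parser = argparse.ArgumentParser()
# Single source of truth for the rules location; tests import the same constant.
parser.add_argument(
    "--rule-set-path",
    default=DEFAULT_VERSION_SETS_PATH,
    help="Path to rules definitions",
)
print(parser.parse_args([]).rule_set_path)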
6 changes: 3 additions & 3 deletions focus_validator/outputter/outputter_unittest.py
@@ -146,9 +146,9 @@ def write(self, result_set):

# Add the testcases to the testsuites
added_testsuites = {}
for testcase in [
r for r in rows if re.match(r"^FV-[D,M][0-9]{3}-[0-9]{4}$", r["check_name"])
]:
for testcase in rows:
if testcase["status"].value == "errored":
continue
test_suite_id = testcase["check_name"].rsplit("-", 1)[0]
if test_suite_id not in added_testsuites:
formatter.add_testsuite(
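
The regex that only accepted ids like FV-D001-0001 is gone; every non-errored row is now grouped into a suite by everything before the last "-", so ids in the new dimension-plus-test-name format are reported as well. A tiny illustration (the second name is a hypothetical example of the new format, not taken from the repo):

for check_name in ["FV-D001-0001", "ChargeType-allow_null"]:
    # rsplit("-", 1)[0] drops only the trailing test-case segment.
    print(check_name.rsplit("-", 1)[0])
# FV-D001
# ChargeType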
7 changes: 2 additions & 5 deletions focus_validator/rules/checks.py
@@ -21,11 +21,8 @@ def is_camel_case(column_name):


@extensions.register_check_method()
def check_not_null(pandas_obj: pd.Series, allow_nulls: bool):
# TODO: works for string type, need to verify for other data types
check_values = pandas_obj.isnull() | (pandas_obj == "")
if not allow_nulls:
check_values = check_values | (pandas_obj == "NULL")
def check_not_null(pandas_obj: pd.Series):
check_values = pandas_obj.isnull()
return ~check_values


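
With the CSV loader now producing real nulls, the registered check no longer needs to special-case empty strings or the literal "NULL". A minimal sketch of how a check registered through pandera's extensions API behaves, using a differently named stand-in rather than the project's check_not_null:

import pandas as pd
import pandera as pa
from pandera import extensions


@extensions.register_check_method()
def sketch_not_null(pandas_obj: pd.Series):
    # Empty strings are ordinary values; only real None/NaN values fail.
    return ~pandas_obj.isnull()


schema = pa.DataFrameSchema(
    {
        "test_dimension": pa.Column(
            checks=pa.Check.sketch_not_null(ignore_na=False), nullable=True
        )
    }
)
# Passes: "" is no longer treated as a null value.
schema.validate(pd.DataFrame({"test_dimension": ["a", ""]}))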
42 changes: 10 additions & 32 deletions tests/checks/test_null_value_check.py
@@ -83,7 +83,7 @@ def test_null_value_not_allowed_invalid_case(self):
allow_nulls=False, data_type=DataTypes.STRING
)
sample_data = pd.DataFrame(
[{"test_dimension": "NULL"}, {"test_dimension": "val2"}]
[{"test_dimension": None}, {"test_dimension": "val2"}]
)
schema, checklist = FocusToPanderaSchemaConverter.generate_pandera_schema(
rules=rules, override_config=None
@@ -104,12 +104,14 @@
"Column": "test_dimension",
"Check Name": "allow_null",
"Description": " test_dimension does not allow null values.",
"Values": "NULL",
"Values": None,
"Row #": 1,
},
)

def test_null_value_allowed_invalid_case_with_empty_strings(self):
def test_null_value_allowed_valid_case_with_empty_strings(self):
# ensure that check does not treat empty strings as null values

rules = self.__generate_sample_rule_type_string__(
allow_nulls=True, data_type=DataTypes.STRING
)
@@ -123,23 +125,11 @@ def test_null_value_allowed_invalid_case_with_empty_strings(self):
)
self.assertEqual(
validation_result.checklist["allow_null"].status,
ChecklistObjectStatus.FAILED,
)
self.assertIsNotNone(validation_result.failure_cases)
failure_cases_dict = validation_result.failure_cases.to_dict(orient="records")
self.assertEqual(len(failure_cases_dict), 1)
self.assertEqual(
failure_cases_dict[0],
{
"Column": "test_dimension",
"Check Name": "allow_null",
"Description": " test_dimension allows null values.",
"Values": "",
"Row #": 2,
},
ChecklistObjectStatus.PASSED,
)
self.assertIsNone(validation_result.failure_cases)

def test_null_value_allowed_invalid_case_with_nan_values(self):
def test_null_value_allowed_case_with_explicit_null_values(self):
rules = self.__generate_sample_rule_type_string__(
allow_nulls=True, data_type=DataTypes.STRING
)
@@ -155,18 +145,6 @@ def test_null_value_allowed_invalid_case_with_nan_values(self):
)
self.assertEqual(
validation_result.checklist["allow_null"].status,
ChecklistObjectStatus.FAILED,
)
self.assertIsNotNone(validation_result.failure_cases)
failure_cases_dict = validation_result.failure_cases.to_dict(orient="records")
self.assertEqual(len(failure_cases_dict), 1)
self.assertEqual(
failure_cases_dict[0],
{
"Column": "test_dimension",
"Check Name": "allow_null",
"Description": " test_dimension allows null values.",
"Values": None,
"Row #": 2,
},
ChecklistObjectStatus.PASSED,
)
self.assertIsNone(validation_result.failure_cases)
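
The rewritten tests encode the new semantics: None/NaN counts as null, an empty string does not. In plain pandas terms (a quick illustration, not project code):

import pandas as pd

values = pd.Series([None, "", "val2"])
print(values.isnull().tolist())  # [True, False, False]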
5 changes: 3 additions & 2 deletions tests/checks/test_sql_query_check.py
@@ -105,20 +105,21 @@ def test_null_value_allowed_valid_case(self):
failure_cases_dict = validation_result.failure_cases.to_dict(orient="records")

self.assertEqual(len(failure_cases_dict), 2)
print(json.dumps(failure_cases_dict, indent=4))
self.assertEqual(
failure_cases_dict,
[
{
"Column": "test_dimension",
"Check Name": "sql_check_for_multiple_columns",
"Description": " None",
"Description": " test_dimension requires values that return true when evaluated by the following SQL query: SELECT test_dimension, CASE WHEN test_dimension = 'some-value' THEN true ELSE false END AS check_output FROM df;",
"Values": "test_dimension:NULL,test_dimension:NULL",
"Row #": 1,
},
{
"Column": "test_dimension",
"Check Name": "sql_check_for_multiple_columns",
"Description": " None",
"Description": " test_dimension requires values that return true when evaluated by the following SQL query: SELECT test_dimension, CASE WHEN test_dimension = 'some-value' THEN true ELSE false END AS check_output FROM df;",
"Values": "test_dimension:NULL,test_dimension:NULL",
"Row #": 4,
},
4 changes: 4 additions & 0 deletions tests/config_objects/test_check_type_friendly_name.py
@@ -49,11 +49,15 @@ def test_random_value_is_ignored(self):
self.assertEqual(sample.check_type_friendly_name, "CheckUnique")

def test_data_type_config(self):
# Ensures that the check_type_friendly_name is generated correctly for DataTypeCheck

model_factory = ModelFactory.create_factory(model=Rule)

# generate random rule object
sample_data_type = model_factory.build(
**{"check": DataTypeCheck(data_type=DataTypes.STRING)}
)

self.assertEqual(sample_data_type.check_type_friendly_name, "DataTypeCheck")

def test_check_type_config_deny_update(self):
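
This exercises the new validate_or_generate_check_type_friendly_name logic in rule.py: string checks are title-cased per underscore-separated word, while model-based checks fall back to the class name. A quick illustration of both branches with stand-in inputs:

check = "check_unique"
print("".join(word.title() for word in check.split("_")))  # CheckUnique


class DataTypeCheck:  # stand-in for the real pydantic model
    pass


print(DataTypeCheck().__class__.__name__)  # DataTypeCheck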
27 changes: 19 additions & 8 deletions tests/config_objects/test_load_bad_rule_config_file.py
@@ -50,20 +50,29 @@ def test_load_schema(self):
)

self.assertEqual(
checklist["bad_rule_config_empty_file"].status, ChecklistObjectStatus.ERRORED
checklist["bad_rule_config_empty_file"].status,
ChecklistObjectStatus.ERRORED,
)

self.assertEqual(checklist["valid_rule_config_column_metadata"].column_id, "ChargeType")
self.assertEqual(checklist["valid_rule_config_column_metadata"].status, ChecklistObjectStatus.PENDING)
self.assertEqual(
checklist["valid_rule_config_column_metadata"].column_id, "ChargeType"
)
self.assertEqual(
checklist["valid_rule_config_column_metadata"].status,
ChecklistObjectStatus.PENDING,
)
self.assertIsNone(checklist["valid_rule_config_column_metadata"].error)
self.assertIsNotNone(checklist["valid_rule_config_column_metadata"].friendly_name)
self.assertIsNotNone(
checklist["valid_rule_config_column_metadata"].friendly_name
)
self.assertEqual(
checklist["valid_rule_config_column_metadata"].friendly_name, "Ensures that column is of string type."
checklist["valid_rule_config_column_metadata"].friendly_name,
"Ensures that column is of string type.",
)

for errored_checks in [
'bad_rule_config_empty_file',
'bad_rule_config_missing_check'
"bad_rule_config_empty_file",
"bad_rule_config_missing_check",
]:
self.assertEqual(
checklist[errored_checks].status, ChecklistObjectStatus.ERRORED
@@ -87,8 +96,10 @@ def test_load_schema_without_valid_column_metadata(self):
rules=rules, override_config=None
)
self.assertEqual(
checklist["bad_rule_config_missing_check"].status, ChecklistObjectStatus.ERRORED
checklist["bad_rule_config_missing_check"].status,
ChecklistObjectStatus.ERRORED,
)
print(checklist["bad_rule_config_missing_check"].error)
self.assertRegex(
checklist["bad_rule_config_missing_check"].error,
"ValidationError:.*",
4 changes: 2 additions & 2 deletions tests/data_loaders/test_null_value_loader.py
@@ -21,7 +21,7 @@ def test_null_value_from_csv(self):
loader = CSVDataLoader(buffer)
data = loader.load()

self.assertEqual(data.to_dict(orient="records")[0], {"value": "NULL"})
self.assertTrue(pd.isnull(data.to_dict(orient="records")[0]["value"]))

def test_null_value_from_csv_with_missing_value(self):
sample_data = pd.DataFrame([{"value": None}])
@@ -36,7 +36,7 @@ def test_null_value_from_csv_with_missing_value(self):
loader = CSVDataLoader(buffer)
data = loader.load()

self.assertEqual(data.to_dict(orient="records")[0], {"value": ""})
self.assertTrue(pd.isnull(data.to_dict(orient="records")[0]["value"]))

def test_null_value_from_parquet(self):
sample_data = pd.DataFrame([{"value": "NULL"}])
29 changes: 29 additions & 0 deletions tests/test_rule_config_load.py
@@ -0,0 +1,29 @@
import os

import pytest

from focus_validator.config_objects import Rule
from focus_validator.rules.spec_rules import SpecRules
from focus_validator.validator import DEFAULT_VERSION_SETS_PATH


def rules_version():
return sorted([x for x in os.walk(DEFAULT_VERSION_SETS_PATH)][0][1])


@pytest.mark.parametrize("focus_spec_version", rules_version())
def test_rules_load_with_no_errors(focus_spec_version):
"""
Test loading of rules with no errors
"""
spec_rules = SpecRules(
override_filename=None,
rule_set_path=DEFAULT_VERSION_SETS_PATH,
rules_version=focus_spec_version,
column_namespace=None,
)
spec_rules.load()

for rule in spec_rules.rules:
# ensures that the rule is a Rule object and not InvalidRule
assert isinstance(rule, Rule)
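
The parametrized versions come from the first level of os.walk, i.e. the immediate subdirectories of the version-sets folder. A small sketch of an equivalent lookup (directory names shown are hypothetical):

import os


def list_rules_versions(path):
    # next(os.walk(path)) yields (dirpath, dirnames, filenames) for the top
    # level only, so [1] is the list of version directories; this matches the
    # list-comprehension form above without walking the whole tree.
    return sorted(next(os.walk(path))[1])


# list_rules_versions(DEFAULT_VERSION_SETS_PATH) might return e.g. ["0.5", "1.0"].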
