Fixed import path for default version sets, output of unittest outputter and null checks, drops support for Python 3.8 (minimum is now 3.9). (#107)

Signed-off-by: Varun Mittal <[email protected]>
finopsfoundation · Dec 19, 2023 · d211632 · d211632
1 parent 187d745
commit d211632
Show file tree

Hide file tree

Showing 20 changed files with 128 additions and 110 deletions.
diff --git a/.github/workflows/coverage.yaml b/.github/workflows/coverage.yaml
@@ -1,5 +1,4 @@
 name: Coverage
-
 on:
   push:
     branches:
@@ -10,14 +9,15 @@ on:
     branches:
       - main
       - dev
-
 jobs:
   coverage:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.9
         uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
       - name: Install poetry
         uses: abatilo/actions-poetry@v2
       - name: Setup a local virtual environment

diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
@@ -1,19 +1,19 @@
 name: Lint
-
 on:
   push:
   pull_request:
     branches:
       - main
       - dev
-
 jobs:
   lint:
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.9
         uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
       - name: Install poetry
         uses: abatilo/actions-poetry@v2
       - name: Setup a local virtual environment

diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -5,7 +5,6 @@ on:
       - main
       - dev
       - issue/**
-
 jobs:
   validate_focus:
     runs-on: ubuntu-latest
@@ -14,8 +13,10 @@ jobs:
     steps:
       - name: Check out repository code
         uses: actions/checkout@v3
-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python 3.9
         uses: actions/setup-python@v4
+        with:
+          python-version: 3.9
       - name: Install poetry
         uses: abatilo/actions-poetry@v2
       - name: Setup a local virtual environment

diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
@@ -4,23 +4,22 @@ on:
     tags:
       - 'v\d\.\d\.\d'
       - 'v\d\.\d\.\d-(dev|rc)\d'
-
 jobs:
   publish:
     permissions:
       id-token: write
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
-      - name: Set up Python 3.8
+      - name: Set up Python 3.11
         uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.11
       - name: Install poetry
         uses: abatilo/actions-poetry@v2
       - name: Install dependencies
         run: |
           find -type l -exec bash -c 'ln -f "$(readlink -m "$0")" "$0"' {} \;
-          poetry build
+          poetry build --format=sdist
       - name: Publish package distributions to PyPI
         uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/unittest.yaml b/.github/workflows/unittest.yaml
@@ -1,5 +1,4 @@
 name: Unittest
-
 on:
   push:
     branches:
@@ -10,14 +9,12 @@ on:
     branches:
       - main
       - dev
-
 jobs:
   test:
     runs-on: ${{ matrix.os }}
-
     strategy:
       matrix:
-        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
+        python-version: [ "3.9", "3.10", "3.11", "3.12" ]
         os: [ windows-latest, ubuntu-latest, macos-latest ]
     steps:
       - uses: actions/checkout@v3

diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ tbd
 
 ### Prerequisites
 
-- Python 3.8+
+- Python 3.9+
 - Poetry (Package & Dependency Manager)
 
 ### Installation

diff --git a/focus_validator/config_objects/common.py b/focus_validator/config_objects/common.py
@@ -68,3 +68,8 @@ def generate_check_friendly_name(check, column_id):
             return f"{column_id} does not allow null values."
     elif isinstance(check, DataTypeCheck):
         return f"{column_id} requires values of type {check.data_type.value}."
+    elif isinstance(check, SQLQueryCheck):
+        sql_query = " ".join([word.strip() for word in check.sql_query.split()])
+        return f"{column_id} requires values that return true when evaluated by the following SQL query: {sql_query}"
+    else:
+        raise NotImplementedError(f"Check {check} not implemented.")
diff --git a/focus_validator/config_objects/focus_to_pandera_schema_converter.py b/focus_validator/config_objects/focus_to_pandera_schema_converter.py
@@ -72,7 +72,7 @@ def __generate_pandera_check__(rule: Rule, check_id):
             )
         elif isinstance(check, AllowNullsCheck):
             return pa.Check.check_not_null(
-                error=error_string, ignore_na=False, allow_nulls=check.allow_nulls
+                error=error_string, ignore_na=check.allow_nulls
             )
         else:
             raise FocusNotImplementedError(

diff --git a/focus_validator/config_objects/rule.py b/focus_validator/config_objects/rule.py
@@ -1,8 +1,9 @@
 import os
-from typing import Optional, Union
+from typing import Annotated, Optional, Union
 
 import yaml
-from pydantic import BaseModel, ConfigDict, model_validator
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+from pydantic_core.core_schema import ValidationInfo
 
 from focus_validator.config_objects.common import (
     SIMPLE_CHECKS,
@@ -33,44 +34,47 @@ class Rule(BaseModel):
         SIMPLE_CHECKS, AllowNullsCheck, ValueInCheck, DataTypeCheck, SQLQueryCheck
     ]
 
-    check_friendly_name: Optional[
-        str
+    check_friendly_name: Annotated[
+        Optional[str], Field(validate_default=True)
     ] = None  # auto generated or else can be overwritten
-    check_type_friendly_name: Optional[str] = None
+    check_type_friendly_name: Annotated[
+        Optional[str], Field(validate_default=True)
+    ] = None
 
     model_config = ConfigDict(
         extra="forbid",  # prevents config from containing any undesirable keys
         frozen=True,  # prevents any modification to any attribute onces loaded from config
     )
 
-    # @root_validator
-    @model_validator(mode="before")
-    @classmethod
-    def root_val(cls, values):
-        """
-        Root validator that checks for all options passed in the config and generate missing options.
-        """
-        if values is None:
-            values = {}
-
-        check = values.get("check")
-        check_friendly_name = values.get("check_friendly_name")
-        column_id = values.get("column_id")
-        if check is not None:
+    @field_validator("check_friendly_name")
+    def validate_or_generate_check_friendly_name(
+        cls, check_friendly_name, validation_info: ValidationInfo
+    ):
+        values = validation_info.data
+        if (
+            check_friendly_name is None
+            and values.get("check") is not None
+            and values.get("column_id") is not None
+        ):
+            check_friendly_name = generate_check_friendly_name(
+                check=values["check"], column_id=values["column_id"]
+            )
+        return check_friendly_name
+
+    @field_validator("check_type_friendly_name")
+    def validate_or_generate_check_type_friendly_name(
+        cls, check_type_friendly_name, validation_info: ValidationInfo
+    ):
+        values = validation_info.data
+        if values.get("check") is not None and values.get("column_id") is not None:
+            check = values.get("check")
             if isinstance(check, str):
                 check_type_friendly_name = "".join(
                     [word.title() for word in check.split("_")]
                 )
             else:
                 check_type_friendly_name = check.__class__.__name__
-            values["check_type_friendly_name"] = check_type_friendly_name
-
-        if check_friendly_name is None and column_id is not None:
-            values["check_friendly_name"] = generate_check_friendly_name(
-                check=check, column_id=column_id
-            )
-
-        return values
+        return check_type_friendly_name
 
     @staticmethod
     def load_yaml(

diff --git a/focus_validator/data_loaders/csv_data_loader.py b/focus_validator/data_loaders/csv_data_loader.py
@@ -6,4 +6,4 @@ def __init__(self, data_filename):
         self.data_filename = data_filename
 
     def load(self):
-        return pd.read_csv(self.data_filename, keep_default_na=False)
+        return pd.read_csv(self.data_filename)
diff --git a/focus_validator/main.py b/focus_validator/main.py
@@ -1,8 +1,7 @@
 import argparse
-import os
 import sys
 
-from focus_validator.validator import Validator
+from focus_validator.validator import DEFAULT_VERSION_SETS_PATH, Validator
 
 
 def main():
@@ -37,7 +36,7 @@ def main():
     )
     parser.add_argument(
         "--rule-set-path",
-        default=os.path.join("focus_validator", "rules", "version_sets"),
+        default=DEFAULT_VERSION_SETS_PATH,
         help="Path to rules definitions",
     )
     parser.add_argument(

diff --git a/focus_validator/outputter/outputter_unittest.py b/focus_validator/outputter/outputter_unittest.py
@@ -1,5 +1,4 @@
 import logging
-import re
 import sys
 import xml.etree.cElementTree as ET
 from datetime import datetime, timezone
@@ -146,9 +145,9 @@ def write(self, result_set):
 
         # Add the testcases to the testsuites
         added_testsuites = {}
-        for testcase in [
-            r for r in rows if re.match(r"^FV-[D,M][0-9]{3}-[0-9]{4}$", r["check_name"])
-        ]:
+        for testcase in rows:
+            if testcase["status"].value == "errored":
+                continue
             test_suite_id = testcase["check_name"].rsplit("-", 1)[0]
             if test_suite_id not in added_testsuites:
                 formatter.add_testsuite(

diff --git a/focus_validator/rules/checks.py b/focus_validator/rules/checks.py
@@ -21,11 +21,8 @@ def is_camel_case(column_name):
 
 
 @extensions.register_check_method()
-def check_not_null(pandas_obj: pd.Series, allow_nulls: bool):
-    # TODO: works for string type, need to verify for other data types
-    check_values = pandas_obj.isnull() | (pandas_obj == "")
-    if not allow_nulls:
-        check_values = check_values | (pandas_obj == "NULL")
+def check_not_null(pandas_obj: pd.Series):
+    check_values = pandas_obj.isnull()
     return ~check_values
 
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "focus-spec-validator"
+name = "focus_validator"
 version = "0.5.2-dev2"
 description = "FOCUS spec validator."
 authors = []
@@ -19,7 +19,7 @@ generate-setup-file = false
 script = "build.py"
 
 [tool.poetry.dependencies]
-python = "^3.8.3"
+python = "^3.9"
 pandas = "^2"
 tabulate = "*"
 pyarrow = "*"
@@ -28,12 +28,7 @@ pyyaml = "*"
 requests = "*"
 pandera = { version = "^0.17.2" }
 sqlglot = "^18.7.0"
-
-# for Python 3.12, force higher version of numpy
-numpy = [
-    { version = "~1.24", python = "~3.8" },
-    { version = "~1.26", python = "~3.12" }
-]
+numpy = { version = "^1.26"}
 pytz = "^2023.3.post1"
 pandasql = "^0.7.3"
 

diff --git a/tests/checks/test_null_value_check.py b/tests/checks/test_null_value_check.py
@@ -83,7 +83,7 @@ def test_null_value_not_allowed_invalid_case(self):
             allow_nulls=False, data_type=DataTypes.STRING
         )
         sample_data = pd.DataFrame(
-            [{"test_dimension": "NULL"}, {"test_dimension": "val2"}]
+            [{"test_dimension": None}, {"test_dimension": "val2"}]
         )
         schema, checklist = FocusToPanderaSchemaConverter.generate_pandera_schema(
             rules=rules, override_config=None
@@ -104,12 +104,14 @@ def test_null_value_not_allowed_invalid_case(self):
                 "Column": "test_dimension",
                 "Check Name": "allow_null",
                 "Description": " test_dimension does not allow null values.",
-                "Values": "NULL",
+                "Values": None,
                 "Row #": 1,
             },
         )
 
-    def test_null_value_allowed_invalid_case_with_empty_strings(self):
+    def test_null_value_allowed_valid_case_with_empty_strings(self):
+        # ensure that check does not treat empty strings as null values
+
         rules = self.__generate_sample_rule_type_string__(
             allow_nulls=True, data_type=DataTypes.STRING
         )
@@ -123,23 +125,11 @@ def test_null_value_allowed_invalid_case_with_empty_strings(self):
         )
         self.assertEqual(
             validation_result.checklist["allow_null"].status,
-            ChecklistObjectStatus.FAILED,
-        )
-        self.assertIsNotNone(validation_result.failure_cases)
-        failure_cases_dict = validation_result.failure_cases.to_dict(orient="records")
-        self.assertEqual(len(failure_cases_dict), 1)
-        self.assertEqual(
-            failure_cases_dict[0],
-            {
-                "Column": "test_dimension",
-                "Check Name": "allow_null",
-                "Description": " test_dimension allows null values.",
-                "Values": "",
-                "Row #": 2,
-            },
+            ChecklistObjectStatus.PASSED,
         )
+        self.assertIsNone(validation_result.failure_cases)
 
-    def test_null_value_allowed_invalid_case_with_nan_values(self):
+    def test_null_value_allowed_case_with_explicit_null_values(self):
         rules = self.__generate_sample_rule_type_string__(
             allow_nulls=True, data_type=DataTypes.STRING
         )
@@ -155,18 +145,6 @@ def test_null_value_allowed_invalid_case_with_nan_values(self):
         )
         self.assertEqual(
             validation_result.checklist["allow_null"].status,
-            ChecklistObjectStatus.FAILED,
-        )
-        self.assertIsNotNone(validation_result.failure_cases)
-        failure_cases_dict = validation_result.failure_cases.to_dict(orient="records")
-        self.assertEqual(len(failure_cases_dict), 1)
-        self.assertEqual(
-            failure_cases_dict[0],
-            {
-                "Column": "test_dimension",
-                "Check Name": "allow_null",
-                "Description": " test_dimension allows null values.",
-                "Values": None,
-                "Row #": 2,
-            },
+            ChecklistObjectStatus.PASSED,
         )
+        self.assertIsNone(validation_result.failure_cases)