Skip to content

Commit

Permalink
Formatted standard
Browse files Browse the repository at this point in the history
  • Loading branch information
canimus committed Jul 14, 2024
1 parent 9039b51 commit 390026f
Show file tree
Hide file tree
Showing 9 changed files with 201 additions and 47 deletions.
115 changes: 100 additions & 15 deletions cuallee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,11 @@ def __post_init__(self):
if len(Counter(map(type, self.value)).keys()) > 1:
raise ValueError("Data types in rule values are inconsistent")

if (self.options and isinstance(self.options, dict) and (rule_name := self.options.get("name"))):
if (
self.options
and isinstance(self.options, dict)
and (rule_name := self.options.get("name"))
):
self.name = rule_name
else:
self.name = self.method
Expand Down Expand Up @@ -393,7 +397,13 @@ def are_complete(self, column: Union[List[str], Tuple[str, str]], pct: float = 1
Rule("are_complete", column, "N/A", CheckDataType.AGNOSTIC, pct) >> self._rule
return self

def is_unique(self, column: str, pct: float = 1.0, approximate: bool = False, ignore_nulls: bool = False):
def is_unique(
self,
column: str,
pct: float = 1.0,
approximate: bool = False,
ignore_nulls: bool = False,
):
"""
Validation for unique values in column
Expand Down Expand Up @@ -424,7 +434,17 @@ def is_primary_key(self, column: str, pct: float = 1.0):
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options={"name" : "is_primary_key"}) >> self._rule
(
Rule(
"is_unique",
column,
"N/A",
CheckDataType.AGNOSTIC,
pct,
options={"name": "is_primary_key"},
)
>> self._rule
)
return self

def are_unique(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
Expand All @@ -448,7 +468,17 @@ def is_composite_key(
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("are_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options={"name" : "is_composite_key"}) >> self._rule
(
Rule(
"are_unique",
column,
"N/A",
CheckDataType.AGNOSTIC,
pct,
options={"name": "is_composite_key"},
)
>> self._rule
)
return self

def is_greater_than(self, column: str, value: float, pct: float = 1.0):
Expand Down Expand Up @@ -557,7 +587,9 @@ def is_equal_than(self, column: str, value: float, pct: float = 1.0):
Rule("is_equal_than", column, value, CheckDataType.NUMERIC, pct) >> self._rule
return self

def has_pattern(self, column: str, value: str, pct: float = 1.0, options: Dict[str, str] = {}):
def has_pattern(
self, column: str, value: str, pct: float = 1.0, options: Dict[str, str] = {}
):
"""
Validation for string type column matching regex expression
Expand All @@ -566,7 +598,12 @@ def has_pattern(self, column: str, value: str, pct: float = 1.0, options: Dict[s
value (regex): A regular expression used to match values in the `column`
pct (float): The threshold percentage required to pass
"""
Rule("has_pattern", column, value, CheckDataType.STRING, pct, options=options) >> self._rule
(
Rule(
"has_pattern", column, value, CheckDataType.STRING, pct, options=options
)
>> self._rule
)
return self

def is_legit(self, column: str, pct: float = 1.0):
Expand All @@ -581,7 +618,17 @@ def is_legit(self, column: str, pct: float = 1.0):
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("has_pattern", column, r"^\S+$", CheckDataType.STRING, pct, options={"name" : "is_legit"}) >> self._rule
(
Rule(
"has_pattern",
column,
r"^\S+$",
CheckDataType.STRING,
pct,
options={"name": "is_legit"},
)
>> self._rule
)
return self

def has_min(self, column: str, value: float):
Expand Down Expand Up @@ -688,7 +735,7 @@ def is_contained_in(
column: str,
value: Union[List, Tuple],
pct: float = 1.0,
options : Dict[str, str] = {}
options: Dict[str, str] = {},
):
"""
Validation of column value in set of given values
Expand All @@ -700,7 +747,14 @@ def is_contained_in(
"""

(
Rule("is_contained_in", column, value, CheckDataType.AGNOSTIC, pct, options=options)
Rule(
"is_contained_in",
column,
value,
CheckDataType.AGNOSTIC,
pct,
options=options,
)
>> self._rule
)

Expand All @@ -717,7 +771,13 @@ def is_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
"""
return self.is_contained_in(column, value, pct, options={"name": "is_in"})

def is_t_minus_n(self, column: str, value: int, pct: float = 1.0, options: Dict[str, str] = {"name": "is_t_minus_n"}):
def is_t_minus_n(
self,
column: str,
value: int,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_t_minus_n"},
):
"""
Validate that date is `n` days before the current date
Expand All @@ -727,7 +787,9 @@ def is_t_minus_n(self, column: str, value: int, pct: float = 1.0, options: Dict[
pct (float): The threshold percentage required to pass
"""
yesterday = datetime.utcnow() - timedelta(days=value)
return self.is_contained_in(column, tuple([yesterday.strftime("%Y-%m-%d")]), pct, options=options)
return self.is_contained_in(
column, tuple([yesterday.strftime("%Y-%m-%d")]), pct, options=options
)

def is_t_minus_1(self, column: str, pct: float = 1.0):
"""
Expand Down Expand Up @@ -894,7 +956,13 @@ def has_correlation(self, column_left: str, column_right: str, value: float):
)
return self

def satisfies(self, column: str, predicate: str, pct: float = 1.0, options: Dict[str, str] = {}):
def satisfies(
self,
column: str,
predicate: str,
pct: float = 1.0,
options: Dict[str, str] = {},
):
"""
Validation of a column satisfying a SQL-like predicate
Expand All @@ -903,7 +971,17 @@ def satisfies(self, column: str, predicate: str, pct: float = 1.0, options: Dict
predicate (str): A predicate written in SQL-like syntax
pct (float): The threshold percentage required to pass
"""
Rule("satisfies", column, predicate, CheckDataType.AGNOSTIC, pct, options=options) >> self._rule
(
Rule(
"satisfies",
column,
predicate,
CheckDataType.AGNOSTIC,
pct,
options=options,
)
>> self._rule
)
return self

def has_cardinality(self, column: str, value: int):
Expand Down Expand Up @@ -1180,7 +1258,11 @@ def has_workflow(
return self

def is_custom(
self, column: Union[str, List[str]], fn: Callable = None, pct: float = 1.0, options: Dict[str, str] = {}
self,
column: Union[str, List[str]],
fn: Callable = None,
pct: float = 1.0,
options: Dict[str, str] = {},
):
"""
Uses a user-defined function that receives the to-be-validated dataframe
Expand All @@ -1192,7 +1274,10 @@ def is_custom(
pct (float): The threshold percentage required to pass
"""

(Rule("is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options) >> self._rule)
(
Rule("is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options)
>> self._rule
)
return self

def validate(self, dataframe: Any):
Expand Down
31 changes: 26 additions & 5 deletions cuallee/bio/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,42 @@ def __init__(self, check: Check):
except Exception:
raise Exception("Unable to load aminoacid definitions")

def is_dna(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name" : "is_dna"}):
def is_dna(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_dna"},
):
"""Validates that a sequence contains only valid nucleotide bases of DNA strand"""
self._check.has_pattern(column, r"^[GTCA]*$", pct, options=options)
return self._check

def is_protein(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_protein"}):
def is_protein(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_protein"},
):
"""Verifies that a sequence contains only valid aminoacid 1-letter codes"""
self._check.has_pattern(
column, rf"^[{''.join(self._aminoacids['1_letter_code'].tolist())}]*$", pct, options=options
column,
rf"^[{''.join(self._aminoacids['1_letter_code'].tolist())}]*$",
pct,
options=options,
)
return self._check

def is_cds(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_cds"}):
def is_cds(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_cds"},
):
"""Verifies that a sequence contains the correct codons"""
self._check.satisfies(
column, f"({column} rlike '^ATG.*') and ({column} rlike '.*(TAA|TAG|TGA)$') and (length({column}) % 3 == 0)", pct, options=options
column,
f"({column} rlike '^ATG.*') and ({column} rlike '.*(TAA|TAG|TGA)$') and (length({column}) % 3 == 0)",
pct,
options=options,
)
return self._check
2 changes: 1 addition & 1 deletion cuallee/duckdb_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def satisfies(self, rule: Rule) -> str:

    # Compatibility with other dataframe regular expression comparisons
expression = re.compile(re.escape("rlike"), re.IGNORECASE)
subquery = expression.sub('SIMILAR TO', rule.value)
subquery = expression.sub("SIMILAR TO", rule.value)
return f"SUM(CAST(({subquery}) AS INTEGER))"

def has_entropy(self, rule: Rule) -> str:
Expand Down
14 changes: 12 additions & 2 deletions cuallee/iso/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,23 @@ def __init__(self, check):
self._ccy = []
self._countries = []

def iso_4217(self, column: str, pct: float = 1.0, options: Dict[str,str]={"name": "iso_4217"}):
def iso_4217(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "iso_4217"},
):
"""It verifies a field against the international standard currency codes via code or number fields from ISO 4217"""
self._ccy = _load_currencies()
self._check.is_contained_in(column, self._ccy, pct, options=options)
return self._check

def iso_3166(self, column: str, pct: float = 1.0, options: Dict[str,str]={"name": "iso_3166"}):
def iso_3166(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "iso_3166"},
):
"""Verifies that country codes are valid against the ISO standard 3166"""
self._countries = _load_countries()
self._check.is_contained_in(column, self._countries, pct, options=options)
Expand Down
8 changes: 3 additions & 5 deletions cuallee/polars_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,15 @@ def is_unique(self, rule: Rule, dataframe: pl.DataFrame) -> Union[bool, int]:
flag = False
if rule.options and isinstance(rule.options, dict):
flag = rule.options.get("ignore_nulls", False)

if flag:
expr = expr.drop_nulls()
extra = Compute._result(
dataframe.select(pl.col(rule.column).is_null().cast(pl.Int8)).sum()
)

expr = expr.is_unique().cast(pl.Int8)
base = Compute._result(
dataframe.select(expr).sum()
)
base = Compute._result(dataframe.select(expr).sum())

if flag:
return base + extra
Expand Down
25 changes: 18 additions & 7 deletions test/unit/bio_checks/test_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,50 @@
import polars as pl
import duckdb


def test_is_dna(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
)
check.table_name = "df"
check.bio.is_dna("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_dna(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["XXX", "YYY", "ZZZ"]})
df = pl.DataFrame({"sequence": ["XXX", "YYY", "ZZZ"]})
check.table_name = "df"
check.bio.is_dna("sequence")
assert check.validate(db).status.str.match("FAIL").all()


def test_is_cds(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
)
check.table_name = "df"
check.bio.is_cds("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_cds(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]}
)
check.table_name = "df"
check.bio.is_cds("sequence")
assert check.validate(db).status.str.match("FAIL").all()


def test_is_protein(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ARND", "PSTW", "GHIL"]})
df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
check.table_name = "df"
check.bio.is_protein("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_protein(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["XXX", "OO1", "UU2"]})
df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
check.table_name = "df"
check.bio.is_protein("sequence")
assert check.validate(db).status.str.match("FAIL").all()
assert check.validate(db).status.str.match("FAIL").all()
Loading

0 comments on commit 390026f

Please sign in to comment.