Skip to content

Commit

Permalink
Formatted standard
Browse files Browse the repository at this point in the history
  • Loading branch information
canimus committed Jul 14, 2024
1 parent 9039b51 commit 390026f
Show file tree
Hide file tree
Showing 9 changed files with 201 additions and 47 deletions.
115 changes: 100 additions & 15 deletions cuallee/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,11 @@ def __post_init__(self):
if len(Counter(map(type, self.value)).keys()) > 1:
raise ValueError("Data types in rule values are inconsistent")

if (self.options and isinstance(self.options, dict) and (rule_name := self.options.get("name"))):
if (
self.options
and isinstance(self.options, dict)
and (rule_name := self.options.get("name"))
):
self.name = rule_name
else:
self.name = self.method
Expand Down Expand Up @@ -393,7 +397,13 @@ def are_complete(self, column: Union[List[str], Tuple[str, str]], pct: float = 1
Rule("are_complete", column, "N/A", CheckDataType.AGNOSTIC, pct) >> self._rule
return self

def is_unique(self, column: str, pct: float = 1.0, approximate: bool = False, ignore_nulls: bool = False):
def is_unique(
self,
column: str,
pct: float = 1.0,
approximate: bool = False,
ignore_nulls: bool = False,
):
"""
Validation for unique values in column
Expand Down Expand Up @@ -424,7 +434,17 @@ def is_primary_key(self, column: str, pct: float = 1.0):
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options={"name" : "is_primary_key"}) >> self._rule
(
Rule(
"is_unique",
column,
"N/A",
CheckDataType.AGNOSTIC,
pct,
options={"name": "is_primary_key"},
)
>> self._rule
)
return self

def are_unique(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
Expand All @@ -448,7 +468,17 @@ def is_composite_key(
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("are_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options={"name" : "is_composite_key"}) >> self._rule
(
Rule(
"are_unique",
column,
"N/A",
CheckDataType.AGNOSTIC,
pct,
options={"name": "is_composite_key"},
)
>> self._rule
)
return self

def is_greater_than(self, column: str, value: float, pct: float = 1.0):
Expand Down Expand Up @@ -557,7 +587,9 @@ def is_equal_than(self, column: str, value: float, pct: float = 1.0):
Rule("is_equal_than", column, value, CheckDataType.NUMERIC, pct) >> self._rule
return self

def has_pattern(self, column: str, value: str, pct: float = 1.0, options: Dict[str, str] = {}):
def has_pattern(
self, column: str, value: str, pct: float = 1.0, options: Dict[str, str] = {}
):
"""
Validation for string type column matching regex expression
Expand All @@ -566,7 +598,12 @@ def has_pattern(self, column: str, value: str, pct: float = 1.0, options: Dict[s
value (regex): A regular expression used to match values in the `column`
pct (float): The threshold percentage required to pass
"""
Rule("has_pattern", column, value, CheckDataType.STRING, pct, options=options) >> self._rule
(
Rule(
"has_pattern", column, value, CheckDataType.STRING, pct, options=options
)
>> self._rule
)
return self

def is_legit(self, column: str, pct: float = 1.0):
Expand All @@ -581,7 +618,17 @@ def is_legit(self, column: str, pct: float = 1.0):
column (str): Column name in dataframe
pct (float): The threshold percentage required to pass
"""
Rule("has_pattern", column, r"^\S+$", CheckDataType.STRING, pct, options={"name" : "is_legit"}) >> self._rule
(
Rule(
"has_pattern",
column,
r"^\S+$",
CheckDataType.STRING,
pct,
options={"name": "is_legit"},
)
>> self._rule
)
return self

def has_min(self, column: str, value: float):
Expand Down Expand Up @@ -688,7 +735,7 @@ def is_contained_in(
column: str,
value: Union[List, Tuple],
pct: float = 1.0,
options : Dict[str, str] = {}
options: Dict[str, str] = {},
):
"""
Validation of column value in set of given values
Expand All @@ -700,7 +747,14 @@ def is_contained_in(
"""

(
Rule("is_contained_in", column, value, CheckDataType.AGNOSTIC, pct, options=options)
Rule(
"is_contained_in",
column,
value,
CheckDataType.AGNOSTIC,
pct,
options=options,
)
>> self._rule
)

Expand All @@ -717,7 +771,13 @@ def is_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
"""
return self.is_contained_in(column, value, pct, options={"name": "is_in"})

def is_t_minus_n(self, column: str, value: int, pct: float = 1.0, options: Dict[str, str] = {"name": "is_t_minus_n"}):
def is_t_minus_n(
self,
column: str,
value: int,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_t_minus_n"},
):
"""
Validate that date is `n` days before the current date
Expand All @@ -727,7 +787,9 @@ def is_t_minus_n(self, column: str, value: int, pct: float = 1.0, options: Dict[
pct (float): The threshold percentage required to pass
"""
yesterday = datetime.utcnow() - timedelta(days=value)
return self.is_contained_in(column, tuple([yesterday.strftime("%Y-%m-%d")]), pct, options=options)
return self.is_contained_in(
column, tuple([yesterday.strftime("%Y-%m-%d")]), pct, options=options
)

def is_t_minus_1(self, column: str, pct: float = 1.0):
"""
Expand Down Expand Up @@ -894,7 +956,13 @@ def has_correlation(self, column_left: str, column_right: str, value: float):
)
return self

def satisfies(self, column: str, predicate: str, pct: float = 1.0, options: Dict[str, str] = {}):
def satisfies(
self,
column: str,
predicate: str,
pct: float = 1.0,
options: Dict[str, str] = {},
):
"""
Validation of a column satisfying a SQL-like predicate
Expand All @@ -903,7 +971,17 @@ def satisfies(self, column: str, predicate: str, pct: float = 1.0, options: Dict
predicate (str): A predicate written in SQL-like syntax
pct (float): The threshold percentage required to pass
"""
Rule("satisfies", column, predicate, CheckDataType.AGNOSTIC, pct, options=options) >> self._rule
(
Rule(
"satisfies",
column,
predicate,
CheckDataType.AGNOSTIC,
pct,
options=options,
)
>> self._rule
)
return self

def has_cardinality(self, column: str, value: int):
Expand Down Expand Up @@ -1180,7 +1258,11 @@ def has_workflow(
return self

def is_custom(
self, column: Union[str, List[str]], fn: Callable = None, pct: float = 1.0, options: Dict[str, str] = {}
self,
column: Union[str, List[str]],
fn: Callable = None,
pct: float = 1.0,
options: Dict[str, str] = {},
):
"""
Uses a user-defined function that receives the to-be-validated dataframe
Expand All @@ -1192,7 +1274,10 @@ def is_custom(
pct (float): The threshold percentage required to pass
"""

(Rule("is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options) >> self._rule)
(
Rule("is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options)
>> self._rule
)
return self

def validate(self, dataframe: Any):
Expand Down
31 changes: 26 additions & 5 deletions cuallee/bio/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,21 +13,42 @@ def __init__(self, check: Check):
except Exception:
raise Exception("Unable to load aminoacid definitions")

def is_dna(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name" : "is_dna"}):
def is_dna(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_dna"},
):
"""Validates that a sequence contains only valid nucleotide bases of DNA strand"""
self._check.has_pattern(column, r"^[GTCA]*$", pct, options=options)
return self._check

def is_protein(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_protein"}):
def is_protein(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_protein"},
):
"""Verifies that a sequence contains only valid aminoacid 1-letter codes"""
self._check.has_pattern(
column, rf"^[{''.join(self._aminoacids['1_letter_code'].tolist())}]*$", pct, options=options
column,
rf"^[{''.join(self._aminoacids['1_letter_code'].tolist())}]*$",
pct,
options=options,
)
return self._check

def is_cds(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_cds"}):
def is_cds(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "is_cds"},
):
"""Verifies that a sequence contains the correct codons"""
self._check.satisfies(
column, f"({column} rlike '^ATG.*') and ({column} rlike '.*(TAA|TAG|TGA)$') and (length({column}) % 3 == 0)", pct, options=options
column,
f"({column} rlike '^ATG.*') and ({column} rlike '.*(TAA|TAG|TGA)$') and (length({column}) % 3 == 0)",
pct,
options=options,
)
return self._check
2 changes: 1 addition & 1 deletion cuallee/duckdb_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def satisfies(self, rule: Rule) -> str:

    # Compatibility with other dataframe regular expression comparisons
expression = re.compile(re.escape("rlike"), re.IGNORECASE)
subquery = expression.sub('SIMILAR TO', rule.value)
subquery = expression.sub("SIMILAR TO", rule.value)
return f"SUM(CAST(({subquery}) AS INTEGER))"

def has_entropy(self, rule: Rule) -> str:
Expand Down
14 changes: 12 additions & 2 deletions cuallee/iso/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,23 @@ def __init__(self, check):
self._ccy = []
self._countries = []

def iso_4217(self, column: str, pct: float = 1.0, options: Dict[str,str]={"name": "iso_4217"}):
def iso_4217(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "iso_4217"},
):
"""It verifies a field against the international standard currency codes via code or number fields from ISO 4217"""
self._ccy = _load_currencies()
self._check.is_contained_in(column, self._ccy, pct, options=options)
return self._check

def iso_3166(self, column: str, pct: float = 1.0, options: Dict[str,str]={"name": "iso_3166"}):
def iso_3166(
self,
column: str,
pct: float = 1.0,
options: Dict[str, str] = {"name": "iso_3166"},
):
"""Verifies that country codes are valid against the ISO standard 3166"""
self._countries = _load_countries()
self._check.is_contained_in(column, self._countries, pct, options=options)
Expand Down
8 changes: 3 additions & 5 deletions cuallee/polars_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,17 +56,15 @@ def is_unique(self, rule: Rule, dataframe: pl.DataFrame) -> Union[bool, int]:
flag = False
if rule.options and isinstance(rule.options, dict):
flag = rule.options.get("ignore_nulls", False)

if flag:
expr = expr.drop_nulls()
extra = Compute._result(
dataframe.select(pl.col(rule.column).is_null().cast(pl.Int8)).sum()
)

expr = expr.is_unique().cast(pl.Int8)
base = Compute._result(
dataframe.select(expr).sum()
)
base = Compute._result(dataframe.select(expr).sum())

if flag:
return base + extra
Expand Down
25 changes: 18 additions & 7 deletions test/unit/bio_checks/test_duckdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,50 @@
import polars as pl
import duckdb


def test_is_dna(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
)
check.table_name = "df"
check.bio.is_dna("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_dna(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["XXX", "YYY", "ZZZ"]})
df = pl.DataFrame({"sequence": ["XXX", "YYY", "ZZZ"]})
check.table_name = "df"
check.bio.is_dna("sequence")
assert check.validate(db).status.str.match("FAIL").all()


def test_is_cds(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]}
)
check.table_name = "df"
check.bio.is_cds("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_cds(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]})
df = pl.DataFrame(
{"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]}
)
check.table_name = "df"
check.bio.is_cds("sequence")
assert check.validate(db).status.str.match("FAIL").all()


def test_is_protein(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["ARND", "PSTW", "GHIL"]})
df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
check.table_name = "df"
check.bio.is_protein("sequence")
assert check.validate(db).status.str.match("PASS").all()


def test_is_not_protein(check, db: duckdb.DuckDBPyConnection):
df = pl.DataFrame({"sequence" : ["XXX", "OO1", "UU2"]})
df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
check.table_name = "df"
check.bio.is_protein("sequence")
assert check.validate(db).status.str.match("FAIL").all()
assert check.validate(db).status.str.match("FAIL").all()
Loading

0 comments on commit 390026f

Please sign in to comment.