From fbef98f5340279fc726b369a7ce879fd67ea1d1f Mon Sep 17 00:00:00 2001 From: Herminio Vazquez Date: Sat, 13 Jul 2024 12:28:04 +0200 Subject: [PATCH] Feature add biochecks (#282) * Modified pyspark validation to accomodate for custom functions * Added cds verification: * Added bio checks test cases * Added README updates --- README.md | 2 +- cuallee/__init__.py | 5 +-- cuallee/bio/checks.py | 9 ++++- cuallee/duckdb_validation.py | 8 ++++- cuallee/pyspark_validation.py | 10 +++--- test/unit/bio_checks/test_duckdb.py | 40 ++++++++++++++++++++++ test/unit/bio_checks/test_polars.py | 45 +++++++++++++++++++++++++ test/unit/bio_checks/test_pyspark.py | 50 ++++++++++++++++++++++++++++ 8 files changed, 160 insertions(+), 9 deletions(-) create mode 100644 test/unit/bio_checks/test_duckdb.py create mode 100644 test/unit/bio_checks/test_polars.py create mode 100644 test/unit/bio_checks/test_pyspark.py diff --git a/README.md b/README.md index 26ef1e2..b64a43e 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ Provider | API | Versions ![databricks](logos/databricks.svg?raw=true "PySpark DataFrame API")| `pyspark` & `spark-connect` |`3.5.x`, `3.4.0`, `3.3.x`, `3.2.x` ![bigquery](logos/bigquery.png?raw=true "BigQuery Client API")| `bigquery` | `3.4.1` ![pandas](logos/pandas.svg?raw=true "Pandas DataFrame API")| `pandas`| `2.0.2`, `1.5.x`, `1.4.x` -![duckdb](logos/duckdb.png?raw=true "DuckDB API")|`duckdb` | `0.10.2`,~~`0.9.2`~~,~~`0.8.0`~~, ~~`0.7.1`~~ +![duckdb](logos/duckdb.png?raw=true "DuckDB API")|`duckdb` | `1.0.0`,~~`0.10.2`~~,~~`0.9.2`~~,~~`0.8.0`~~, ~~`0.7.1`~~ ![polars](logos/polars.svg?raw=true "Polars API")|`polars`| `1.0.0`,~~`0.19.6`~~ ![daft](logos/daft.png?raw=true "Daft API")|`daft`| `0.2.24`, ~~`0.2.19`~~ diff --git a/cuallee/__init__.py b/cuallee/__init__.py index fc70f9c..8f83b88 100644 --- a/cuallee/__init__.py +++ b/cuallee/__init__.py @@ -142,7 +142,7 @@ def __post_init__(self): self.name = self.method def __repr__(self): - return 
def is_protein(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_protein"}):
    """Verify that a sequence contains only valid amino-acid 1-letter codes.

    Builds a character class from the ``1_letter_code`` column of
    ``self._aminoacids`` and registers it through ``has_pattern``.

    Args:
        column (str): Column holding the sequences.
        pct (float): Threshold forwarded unchanged to ``has_pattern``.
        options (dict): Rule options forwarded to ``has_pattern``.

    Returns:
        The underlying check (``self._check``), so calls can be chained.
    """
    alphabet = "".join(self._aminoacids["1_letter_code"].tolist())
    # The default ``options`` dict is created once at definition time; hand
    # over a copy so a mutation downstream cannot leak into later calls.
    rule_options = dict(options) if isinstance(options, dict) else options
    self._check.has_pattern(column, rf"^[{alphabet}]*$", pct, options=rule_options)
    return self._check


def is_cds(self, column: str, pct: float = 1.0, options: Dict[str, str] = {"name": "is_cds"}):
    """Verify that a sequence is framed like a coding sequence (CDS).

    The registered predicate requires all of:
      * the value starts with the start codon ``ATG``;
      * the value ends with a stop codon (``TAA``, ``TAG`` or ``TGA``);
      * the length is a multiple of three.

    The characters between start and stop codon are not restricted by this
    predicate; combine with ``is_dna`` when that matters.

    Args:
        column (str): Column holding the sequences.
        pct (float): Threshold forwarded unchanged to ``satisfies``.
        options (dict): Rule options forwarded to ``satisfies``.

    Returns:
        The underlying check (``self._check``), so calls can be chained.
    """
    starts_with_atg = f"({column} rlike '^ATG.*')"
    ends_with_stop = f"({column} rlike '.*(TAA|TAG|TGA)$')"
    whole_codons = f"(length({column}) % 3 == 0)"
    predicate = " and ".join((starts_with_atg, ends_with_stop, whole_codons))
    # Same reasoning as in is_protein: never share the default dict instance.
    rule_options = dict(options) if isinstance(options, dict) else options
    self._check.satisfies(column, predicate, pct, options=rule_options)
    return self._check
def satisfies(self, rule: "Rule") -> str:
    """Build the DuckDB aggregate for a free-form SQL predicate rule.

    ``rule.value`` is embedded as a boolean SQL expression; the result sums
    the rows for which that expression casts to 1.

    Args:
        rule (Rule): Rule whose ``value`` is the SQL predicate text.

    Returns:
        str: ``SUM(CAST((<predicate>) AS INTEGER))`` with every standalone
        ``rlike`` keyword (any case) rewritten to ``SIMILAR TO``.
    """
    # Predicates written with RLIKE are translated to DuckDB's SIMILAR TO.
    # The word boundaries keep identifiers that merely contain "rlike"
    # (e.g. a column called girlike_score) untouched.
    # NOTE: the keyword is still rewritten when it appears inside a quoted
    # SQL string literal; the predicate is not parsed.
    predicate = re.sub(r"\brlike\b", "SIMILAR TO", rule.value, flags=re.IGNORECASE)
    return f"SUM(CAST(({predicate}) AS INTEGER))"
def test_is_not_dna(check, db: duckdb.DuckDBPyConnection):
    """Sequences made of non-nucleotide letters must fail ``is_dna``."""
    # NOTE(review): `df` looks unused, but table_name = "df" suggests DuckDB
    # resolves the frame by variable name - confirm before removing it.
    df = pl.DataFrame({"sequence": ["XXX", "YYY", "ZZZ"]})
    check.table_name = "df"
    check.bio.is_dna("sequence")
    statuses = list(check.validate(db).status)
    # Exact comparison plus a non-empty guard: an empty result must not pass.
    assert statuses and all(status == "FAIL" for status in statuses)


def test_is_cds(check, db: duckdb.DuckDBPyConnection):
    """Sequences starting with ATG, ending in a stop codon and of length % 3 == 0 pass ``is_cds``."""
    # NOTE(review): `df` is presumably looked up by name through table_name.
    df = pl.DataFrame({"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
    check.table_name = "df"
    check.bio.is_cds("sequence")
    statuses = list(check.validate(db).status)
    assert statuses and all(status == "PASS" for status in statuses)


def test_is_not_cds(check, db: duckdb.DuckDBPyConnection):
    """Sequences that do not end in TAA/TAG/TGA must fail ``is_cds``."""
    # NOTE(review): `df` is presumably looked up by name through table_name.
    df = pl.DataFrame({"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]})
    check.table_name = "df"
    check.bio.is_cds("sequence")
    statuses = list(check.validate(db).status)
    assert statuses and all(status == "FAIL" for status in statuses)


def test_is_protein(check, db: duckdb.DuckDBPyConnection):
    """Sequences of amino-acid 1-letter codes are expected to pass ``is_protein``."""
    # NOTE(review): `df` is presumably looked up by name through table_name.
    df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
    check.table_name = "df"
    check.bio.is_protein("sequence")
    statuses = list(check.validate(db).status)
    assert statuses and all(status == "PASS" for status in statuses)


def test_is_not_protein(check, db: duckdb.DuckDBPyConnection):
    """Sequences with characters outside the amino-acid codes are expected to fail ``is_protein``."""
    # NOTE(review): `df` is presumably looked up by name through table_name.
    df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
    check.table_name = "df"
    check.bio.is_protein("sequence")
    statuses = list(check.validate(db).status)
    assert statuses and all(status == "FAIL" for status in statuses)
def test_is_cds(check):
    """Sequences starting with ATG, ending in a stop codon and of length % 3 == 0 pass ``is_cds``."""
    df = pl.DataFrame({"sequence": ["ATGCCCTTTGGGTAA", "ATGCCCTTTGGGTAG", "ATGCCCTTTGGGTGA"]})
    check.bio.is_cds("sequence")
    # Validate once (the result was previously computed twice and one copy
    # discarded) and require at least one row so an empty result cannot pass.
    statuses = check.validate(df)["status"].to_list()
    assert statuses and all(status == "PASS" for status in statuses)


def test_is_not_cds(check):
    """Sequences that do not end in TAA/TAG/TGA must fail ``is_cds``."""
    df = pl.DataFrame({"sequence": ["ATGCCCTTTGGGTCC", "ATGCCCTTTGGGCCC", "ATGCCCTTTGGGTTT"]})
    check.bio.is_cds("sequence")
    statuses = check.validate(df)["status"].to_list()
    assert statuses and all(status == "FAIL" for status in statuses)


def test_is_protein(check):
    """Sequences of amino-acid 1-letter codes are expected to pass ``is_protein``."""
    df = pl.DataFrame({"sequence": ["ARND", "PSTW", "GHIL"]})
    check.bio.is_protein("sequence")
    statuses = check.validate(df)["status"].to_list()
    assert statuses and all(status == "PASS" for status in statuses)


def test_is_not_protein(check):
    """Sequences with characters outside the amino-acid codes are expected to fail ``is_protein``."""
    df = pl.DataFrame({"sequence": ["XXX", "OO1", "UU2"]})
    check.bio.is_protein("sequence")
    statuses = check.validate(df)["status"].to_list()
    assert statuses and all(status == "FAIL" for status in statuses)
def test_is_cds(check, spark):
    """Sequences starting with ATG, ending in a stop codon and of length % 3 == 0 pass ``is_cds``."""
    df = spark.createDataFrame(
        [("ATGCCCTTTGGGTAA",), ("ATGCCCTTTGGGTAG",), ("ATGCCCTTTGGGTGA",)],
        schema="sequence string",
    )
    check.bio.is_cds("sequence")
    # Fetch the summary row once instead of calling first() per assertion.
    row = check.validate(df).first()
    assert row.status == "PASS"
    assert row.violations == 0
    assert row.pass_threshold == 1.0


def test_is_not_cds(check, spark):
    """Sequences that do not end in TAA/TAG/TGA must fail ``is_cds`` on every row."""
    df = spark.createDataFrame(
        [("ATGCCCTTTGGGTCC",), ("ATGCCCTTTGGGCCC",), ("ATGCCCTTTGGGTTT",)],
        schema="sequence string",
    )
    check.bio.is_cds("sequence")
    row = check.validate(df).first()
    assert row.status == "FAIL"
    assert row.violations == 3
    assert row.pass_threshold == 1.0


def test_is_protein(check, spark):
    """Sequences of amino-acid 1-letter codes are expected to pass ``is_protein``."""
    df = spark.createDataFrame([("ARND",), ("PSTW",), ("GHIL",)], schema="sequence string")
    check.bio.is_protein("sequence")
    row = check.validate(df).first()
    assert row.status == "PASS"
    assert row.violations == 0
    assert row.pass_threshold == 1.0


def test_is_not_protein(check, spark):
    """Sequences with characters outside the amino-acid codes are expected to fail on every row."""
    df = spark.createDataFrame([("XXX",), ("OO1",), ("UU2",)], schema="sequence string")
    check.bio.is_protein("sequence")
    row = check.validate(df).first()
    assert row.status == "FAIL"
    assert row.violations == 3
    assert row.pass_threshold == 1.0