Skip to content

Commit

Permalink
Rename all rule definitions
Browse files Browse the repository at this point in the history
* Remove serial numbers
* Remove measure/dimension distinction
* Add suffix which conventionally indicates the test being done
* Infer check_id from rule path

Signed-off-by: Joshua Kwan <[email protected]>
  • Loading branch information
joshk0 committed Nov 14, 2023
1 parent 6fe474a commit 4b2c595
Show file tree
Hide file tree
Showing 72 changed files with 136 additions and 222 deletions.
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from itertools import groupby
from typing import Dict, List, Optional, Set, Union

Expand Down Expand Up @@ -151,7 +152,7 @@ def generate_pandera_schema(
for rule in rules:
if isinstance(rule, InvalidRule):
checklist[rule.rule_path] = ChecklistObject(
check_name=rule.rule_path,
check_name=os.path.splitext(os.path.basename(rule.rule_path))[0],
column_id="Unknown",
error=f"{rule.error_type}: {rule.error}",
status=ChecklistObjectStatus.ERRORED,
Expand Down
20 changes: 15 additions & 5 deletions focus_validator/config_objects/rule.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from typing import Optional, Union

import yaml
Expand Down Expand Up @@ -46,6 +47,8 @@ def root_val(cls, values):
"""
Root validator that checks for all options passed in the config and generate missing options.
"""
if values is None:
values = {}

check = values.get("check")
check_friendly_name = values.get("check_friendly_name")
Expand All @@ -59,17 +62,19 @@ def root_val(cls, values):
check_type_friendly_name = check.__class__.__name__
values["check_type_friendly_name"] = check_type_friendly_name

if check_friendly_name is None and column_id is not None:
values["check_friendly_name"] = generate_check_friendly_name(
check=check, column_id=column_id
)
if check_friendly_name is None and column_id is not None:
values["check_friendly_name"] = generate_check_friendly_name(
check=check, column_id=column_id
)

return values

@staticmethod
def load_yaml(
rule_path, column_namespace: Optional[str] = None
) -> Union["Rule", InvalidRule]:
rule_path_basename = os.path.splitext(os.path.basename(rule_path))[0]

try:
with open(rule_path, "r") as f:
rule_obj = yaml.safe_load(f)
Expand All @@ -81,10 +86,15 @@ def load_yaml(
):
rule_obj["column"] = f"{column_namespace}:{rule_obj['column']}"

if isinstance(rule_obj, dict) and "check_id" not in rule_obj:
rule_obj["check_id"] = rule_path_basename

return Rule.model_validate(rule_obj)
except Exception as e:
return InvalidRule(
rule_path=rule_path, error=str(e), error_type=e.__class__.__name__
rule_path=rule_path_basename,
error=str(e),
error_type=e.__class__.__name__,
)


Expand Down
4 changes: 2 additions & 2 deletions focus_validator/data_loaders/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ def __init__(self, data_filename):
def find_data_loader(self):
    """Return the data-loader class matching the detected type of the input file.

    The type description comes from ``get_file_mime_type`` applied to
    ``self.data_filename`` and is compared against a fixed set of strings.

    Returns:
        ``CSVDataLoader`` for any of the recognised CSV/ASCII descriptions,
        ``ParquetDataLoader`` for ``"Apache Parquet"``. The class itself is
        returned, not an instance.

    Raises:
        FocusNotImplementedError: if the description matches no known loader.
    """
    file_mime_type = get_file_mime_type(self.data_filename)

    # "CSV ASCII text" is accepted in addition to the two older descriptions.
    if file_mime_type in ["ASCII text", "CSV text", "CSV ASCII text"]:
        return CSVDataLoader
    elif file_mime_type == "Apache Parquet":
        return ParquetDataLoader
    else:
        # The description is quoted so an empty or whitespace-only value is
        # still visible in the error message.
        raise FocusNotImplementedError(
            msg=f"Validator for file_type '{file_mime_type}' not implemented yet."
        )

def load(self):
Expand Down
32 changes: 20 additions & 12 deletions focus_validator/outputter/outputter_console.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math

import pandas as pd
from tabulate import tabulate

Expand Down Expand Up @@ -61,35 +62,42 @@ def write(self, result_set: ValidationResult):
print(tabulate(checklist, headers="keys", tablefmt="psql"))

if result_set.failure_cases is not None:
aggregated_failures = result_set.failure_cases.groupby(by=['Check Name', 'Column', 'Description'], as_index=False).aggregate(lambda x: maybe_collapse_range(x.unique().tolist()))
aggregated_failures = result_set.failure_cases.groupby(
by=["Check Name", "Column", "Description"], as_index=False
).aggregate(lambda x: collapse_occurrence_range(x.unique().tolist()))

print("Checks summary:")
print(
tabulate(
tabular_data=aggregated_failures, # type: ignore
tabular_data=aggregated_failures, # type: ignore
headers="keys",
tablefmt="psql",
)
)

def collapse_occurrence_range(occurrence_range: list):
    """Collapse a list of whole numbers into sorted "start-end" range strings.

    Runs of consecutive integers are merged, so ``[1, 2, 3, 5]`` becomes
    ``["1-3", "5"]``. Integral floats such as ``2.0`` are treated as the
    integer they represent.

    Args:
        occurrence_range: Values to collapse; may be empty.

    Returns:
        A list of strings, one per run (``"n"`` for a single value,
        ``"a-b"`` for a run). If any element is not a whole number
        (a string, ``None``, NaN, infinity, or a fractional float), the
        input list is returned unchanged, because such values cannot be
        expressed as integer ranges.
    """
    # Validate before sorting: sorting a mixed-type list (e.g. str and int)
    # would raise TypeError instead of falling back to the original list.
    whole_numbers = []
    for item in occurrence_range:
        if isinstance(item, int):
            whole_numbers.append(int(item))
        elif isinstance(item, float) and math.isfinite(item) and item.is_integer():
            whole_numbers.append(int(item))
        else:
            return occurrence_range

    collapsed = []
    start = end = None
    for n in sorted(set(whole_numbers)):
        if start is None:
            start = end = n
        elif n == end + 1:
            end = n
        else:
            # Compare against None above rather than relying on truthiness:
            # a run that ends at 0 must still be flushed here.
            collapsed.append(f"{start}" if start == end else f"{start}-{end}")
            start = end = n

    if start is not None:
        collapsed.append(f"{start}" if start == end else f"{start}-{end}")

    return collapsed


# Previous name of this helper, kept so callers that still use it keep working.
maybe_collapse_range = collapse_occurrence_range
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M002-0001
column_id: AmortizedCost
check:
data_type: decimal
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M002-0002
column_id: AmortizedCost
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M002-0003
column_id: AmortizedCost
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D014-0001
column_id: AvailabilityZone
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D014-0002
column_id: AvailabilityZone
check:
allow_nulls: true
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M001-0001
column_id: BilledCost
check:
data_type: decimal
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M001-0002
column_id: BilledCost
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-M001-0003
column_id: BilledCost
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D010-0001
column_id: BilledCurrency
check:
data_type: currency-code
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D010-0002
column_id: BilledCurrency
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D010-0003
column_id: BilledCurrency
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D006-0001
column_id: BillingAccountId
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D006-0002
column_id: BillingAccountId
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D006-0003
column_id: BillingAccountId
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D005-0001
column_id: BillingAccountName
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D005-0002
column_id: BillingAccountName
check:
allow_nulls: True
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D005-0003
column_id: BillingAccountName
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D011-0001
column_id: BillingPeriodEnd
check:
data_type: datetime
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D011-0002
column_id: BillingPeriodEnd
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D011-0003
column_id: BillingPeriodEnd
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D012-0001
column_id: BillingPeriodStart
check:
data_type: datetime
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D012-0002
column_id: BillingPeriodStart
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D012-0003
column_id: BillingPeriodStart
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D017-0001
column_id: ChargePeriodEnd
check:
data_type: datetime
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D017-0002
column_id: ChargePeriodEnd
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D017-0003
column_id: ChargePeriodEnd
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D016-0001
column_id: ChargePeriodStart
check:
data_type: datetime
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D016-0002
column_id: ChargePeriodStart
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D016-0003
column_id: ChargePeriodStart
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D001-0003
column_id: ChargeType
check:
value_in:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D001-0001
column_id: ChargeType
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D001-0004
column_id: ChargeType
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
check_id: FV-D001-0002
column_id: ChargeType
check: column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D003-0001
column_id: InvoiceIssuer
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D003-0002
column_id: InvoiceIssuer
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D004-0001
column_id: Provider
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D004-0002
column_id: Provider
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D004-0003
column_id: Provider
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D007-0001
column_id: Publisher
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D007-0002
column_id: Publisher
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D007-0003
column_id: Publisher
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D013-0001
column_id: Region
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D013-0002
column_id: Region
check:
allow_nulls: false
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D013-0003
column_id: Region
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D002-0001
column_id: ResourceID
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D002-0002
column_id: ResourceID
check:
allow_nulls: true
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D008-0001
column_id: ResourceName
check:
data_type: string
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D008-0002
column_id: ResourceName
check:
allow_nulls: true
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D008-0003
column_id: ResourceName
check:
column_required
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D015-0004
column_id: ServiceCategory
check_friendly_name: "ServiceCategory must have a value defined in spec."
check:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
check_id: FV-D015-0001
column_id: ServiceCategory
check:
data_type: string
Loading

0 comments on commit 4b2c595

Please sign in to comment.