Skip to content

Commit

Permalink
Removed magic dependency and added github pipeline for windows, mac. (#105)
Browse files Browse the repository at this point in the history

Signed-off-by: Varun Mittal <[email protected]>
  • Loading branch information
varunmittal91 authored Dec 6, 2023
1 parent 42d997e commit b38896d
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 43 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/unittest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}

strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
os: [ windows-latest, ubuntu-latest, macos-latest ]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -36,7 +36,7 @@ jobs:
name: Define a cache for the virtual environment based on the dependencies lock file
with:
path: ./.venv
key: venv-${{ hashFiles('poetry.lock') }}
key: venv-${{ hashFiles('poetry.lock') }}-${{ matrix.os }}-${{ matrix.python-version }}
- name: Install dependencies
run: |
poetry install
Expand Down
17 changes: 3 additions & 14 deletions focus_validator/data_loaders/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,21 @@
import magic

from focus_validator.data_loaders.csv_data_loader import CSVDataLoader
from focus_validator.data_loaders.parquet_data_loader import ParquetDataLoader
from focus_validator.exceptions import FocusNotImplementedError


def get_file_mime_type(filename):
f = magic.Magic(uncompress=True)
return f.from_file(filename=filename)


class DataLoader:
def __init__(self, data_filename):
self.data_filename = data_filename
self.data_loader_class = self.find_data_loader()
self.data_loader = self.data_loader_class(self.data_filename)

def find_data_loader(self):
file_mime_type = get_file_mime_type(self.data_filename)

if file_mime_type in ["ASCII text", "CSV text", "CSV ASCII text"]:
if self.data_filename.endswith(".csv"):
return CSVDataLoader
elif file_mime_type == "Apache Parquet":
elif self.data_filename.endswith(".parquet"):
return ParquetDataLoader
else:
raise FocusNotImplementedError(
msg=f"Validator for file_type '{file_mime_type}' not implemented yet."
)
raise FocusNotImplementedError("File type not implemented yet.")

def load(self):
return self.data_loader.load()
15 changes: 5 additions & 10 deletions focus_validator/rules/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,11 @@ def check_sql_query(df_groups, sql_query, column_alias):

# for the given indexes in false_indexes list, we are extracting the rows from the dataframe and
# add column_alias value to failure_case column and index to index column
failure_cases = df[df.index.isin(false_indexes)]
failure_cases["failure_case"] = df.apply(
lambda row: {column: row[column] for column in column_alias}, axis=1
)
failure_cases["failure_case"] = df.apply(
lambda row: ",".join(
[f"{column}:{row[column]}" for column in column_alias]
),
axis=1,
)
failure_cases = df[df.index.isin(false_indexes)].copy()
failure_cases.loc[:, "failure_case"] = [
",".join([f"{column}:{row[column]}" for column in column_alias])
for _, row in failure_cases.iterrows()
]

raise SchemaError(
schema=pa.DataFrameSchema(),
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ pandas = "^2"
tabulate = "*"
pyarrow = "*"
pydantic = "^2"
python-magic = "*"
pyyaml = "*"
requests = "*"
pandera = { version = "^0.17.2" }
Expand Down
20 changes: 12 additions & 8 deletions tests/attributes/test_datetime_column_load_from_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ def test_load_column_with_valid_datetime_utc(self):

sample_df = pd.DataFrame([{random_column_id: utc_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -102,9 +103,10 @@ def test_load_column_with_valid_datetime_naive(self):

sample_df = pd.DataFrame([{random_column_id: naive_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -126,10 +128,11 @@ def test_load_column_with_valid_datetime_not_utc(self):
# generate random dataframe
sample_df = pd.DataFrame([{random_column_id: aware_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
# write csv to temporary location and read to simulate df read
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -150,10 +153,11 @@ def test_load_column_with_invalid_datetime(self):
# generate random dataframe
sample_df = pd.DataFrame([{random_column_id: bad_value}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
# write csv to temporary location and read to simulate df read
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand Down
12 changes: 8 additions & 4 deletions tests/checks/test_sql_query_check_from_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import tempfile
from unittest import TestCase

Expand Down Expand Up @@ -27,10 +28,13 @@

class TestSQLQueryCheckConfig(TestCase):
def test_config_from_yaml(self):
with tempfile.NamedTemporaryFile() as f:
f.write(YAML_CONFIG.encode())
f.seek(0)
rule = Rule.load_yaml(f.name)
with tempfile.TemporaryDirectory() as temp_dir:
sample_file_path = os.path.join(temp_dir, "D001_S001.yaml")

with open(sample_file_path, "w") as fd:
fd.write(YAML_CONFIG)

rule = Rule.load_yaml(sample_file_path)

dimension_checks = [
Rule(
Expand Down
4 changes: 2 additions & 2 deletions tests/data_loaders/test_null_value_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_null_value_from_csv(self):
sample_data = pd.DataFrame([{"value": "NULL"}])

buffer = io.BytesIO()
sample_data.to_csv(buffer, index=False)
sample_data.to_csv(buffer, index=False, lineterminator="\n")

buffer.seek(0)
self.assertEqual(buffer.read(), b"value\nNULL\n")
Expand All @@ -27,7 +27,7 @@ def test_null_value_from_csv_with_missing_value(self):
sample_data = pd.DataFrame([{"value": None}])

buffer = io.BytesIO()
sample_data.to_csv(buffer, index=False)
sample_data.to_csv(buffer, index=False, lineterminator="\n")

buffer.seek(0)
self.assertEqual(buffer.read(), b'value\n""\n')
Expand Down

0 comments on commit b38896d

Please sign in to comment.