Skip to content

Commit

Permalink
Removed magic dependency and added github pipeline for windows, mac. (#105)
Browse files Browse the repository at this point in the history

Signed-off-by: Varun Mittal <[email protected]>
  • Loading branch information
varunmittal91 authored Dec 6, 2023
1 parent 42d997e commit b38896d
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 43 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/unittest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@ on:

jobs:
test:
runs-on: ubuntu-latest
runs-on: ${{ matrix.os }}

strategy:
matrix:
python-version: [ "3.8", "3.9", "3.10", "3.11" ]

python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
os: [ windows-latest, ubuntu-latest, macos-latest ]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
Expand All @@ -36,7 +36,7 @@ jobs:
name: Define a cache for the virtual environment based on the dependencies lock file
with:
path: ./.venv
key: venv-${{ hashFiles('poetry.lock') }}
key: venv-${{ hashFiles('poetry.lock') }}-${{ matrix.os }}-${{ matrix.python-version }}
- name: Install dependencies
run: |
poetry install
Expand Down
17 changes: 3 additions & 14 deletions focus_validator/data_loaders/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,32 +1,21 @@
import magic

from focus_validator.data_loaders.csv_data_loader import CSVDataLoader
from focus_validator.data_loaders.parquet_data_loader import ParquetDataLoader
from focus_validator.exceptions import FocusNotImplementedError


def get_file_mime_type(filename):
f = magic.Magic(uncompress=True)
return f.from_file(filename=filename)


class DataLoader:
def __init__(self, data_filename):
self.data_filename = data_filename
self.data_loader_class = self.find_data_loader()
self.data_loader = self.data_loader_class(self.data_filename)

def find_data_loader(self):
file_mime_type = get_file_mime_type(self.data_filename)

if file_mime_type in ["ASCII text", "CSV text", "CSV ASCII text"]:
if self.data_filename.endswith(".csv"):
return CSVDataLoader
elif file_mime_type == "Apache Parquet":
elif self.data_filename.endswith(".parquet"):
return ParquetDataLoader
else:
raise FocusNotImplementedError(
msg=f"Validator for file_type '{file_mime_type}' not implemented yet."
)
raise FocusNotImplementedError("File type not implemented yet.")

def load(self):
return self.data_loader.load()
15 changes: 5 additions & 10 deletions focus_validator/rules/checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,11 @@ def check_sql_query(df_groups, sql_query, column_alias):

# for the given indexes in false_indexes list, we are extracting the rows from the dataframe and
# add column_alias value to failure_case column and index to index column
failure_cases = df[df.index.isin(false_indexes)]
failure_cases["failure_case"] = df.apply(
lambda row: {column: row[column] for column in column_alias}, axis=1
)
failure_cases["failure_case"] = df.apply(
lambda row: ",".join(
[f"{column}:{row[column]}" for column in column_alias]
),
axis=1,
)
failure_cases = df[df.index.isin(false_indexes)].copy()
failure_cases.loc[:, "failure_case"] = [
",".join([f"{column}:{row[column]}" for column in column_alias])
for _, row in failure_cases.iterrows()
]

raise SchemaError(
schema=pa.DataFrameSchema(),
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ pandas = "^2"
tabulate = "*"
pyarrow = "*"
pydantic = "^2"
python-magic = "*"
pyyaml = "*"
requests = "*"
pandera = { version = "^0.17.2" }
Expand Down
20 changes: 12 additions & 8 deletions tests/attributes/test_datetime_column_load_from_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ def test_load_column_with_valid_datetime_utc(self):

sample_df = pd.DataFrame([{random_column_id: utc_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -102,9 +103,10 @@ def test_load_column_with_valid_datetime_naive(self):

sample_df = pd.DataFrame([{random_column_id: naive_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -126,10 +128,11 @@ def test_load_column_with_valid_datetime_not_utc(self):
# generate random dataframe
sample_df = pd.DataFrame([{random_column_id: aware_datetime}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
# write csv to temporary location and read to simulate df read
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand All @@ -150,10 +153,11 @@ def test_load_column_with_invalid_datetime(self):
# generate random dataframe
sample_df = pd.DataFrame([{random_column_id: bad_value}])

with tempfile.NamedTemporaryFile(suffix=".csv") as temp_file:
with tempfile.NamedTemporaryFile(suffix=".csv", mode="r+") as temp_file:
# write csv to temporary location and read to simulate df read
sample_df.to_csv(temp_file)
read_df = pd.read_csv(temp_file.name)
temp_file.seek(0)
read_df = pd.read_csv(temp_file)

self.__assert_values__(
random_column_id=random_column_id,
Expand Down
12 changes: 8 additions & 4 deletions tests/checks/test_sql_query_check_from_config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import tempfile
from unittest import TestCase

Expand Down Expand Up @@ -27,10 +28,13 @@

class TestSQLQueryCheckConfig(TestCase):
def test_config_from_yaml(self):
with tempfile.NamedTemporaryFile() as f:
f.write(YAML_CONFIG.encode())
f.seek(0)
rule = Rule.load_yaml(f.name)
with tempfile.TemporaryDirectory() as temp_dir:
sample_file_path = os.path.join(temp_dir, "D001_S001.yaml")

with open(sample_file_path, "w") as fd:
fd.write(YAML_CONFIG)

rule = Rule.load_yaml(sample_file_path)

dimension_checks = [
Rule(
Expand Down
4 changes: 2 additions & 2 deletions tests/data_loaders/test_null_value_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def test_null_value_from_csv(self):
sample_data = pd.DataFrame([{"value": "NULL"}])

buffer = io.BytesIO()
sample_data.to_csv(buffer, index=False)
sample_data.to_csv(buffer, index=False, lineterminator="\n")

buffer.seek(0)
self.assertEqual(buffer.read(), b"value\nNULL\n")
Expand All @@ -27,7 +27,7 @@ def test_null_value_from_csv_with_missing_value(self):
sample_data = pd.DataFrame([{"value": None}])

buffer = io.BytesIO()
sample_data.to_csv(buffer, index=False)
sample_data.to_csv(buffer, index=False, lineterminator="\n")

buffer.seek(0)
self.assertEqual(buffer.read(), b'value\n""\n')
Expand Down

0 comments on commit b38896d

Please sign in to comment.