Merge pull request #41 from factly/fix/sheet_issues

Fix/sheet issues
factly · Mar 19, 2024 · 4023e9f · 4023e9f
2 parents 06ef96a + 2a121b0
commit 4023e9f
Show file tree

Hide file tree

Showing 9 changed files with 355 additions and 18 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,14 +1,14 @@
 repos:
 -   repo: https://github.com/psf/black
-    rev: 22.3.0
+    rev: 24.3.0
     hooks:
     - id: black
       language_version: python3
 -   repo: https://github.com/pycqa/flake8
-    rev: 6.1.0
+    rev: 7.0.0
     hooks:
     - id: flake8
 -   repo: https://github.com/timothycrosley/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
     - id: isort
diff --git a/app/api/api_v1/routers/metadata.py b/app/api/api_v1/routers/metadata.py
@@ -6,6 +6,7 @@
 
 from app.models.enums import ExpectationResultType
 from app.models.metadata_gsheet import MetadataGsheetRequest
+from app.utils.common import read_dataset
 from app.utils.gsheets import get_records_from_gsheets
 from app.utils.metadata import metadata_expectation_suite
 
@@ -20,16 +21,22 @@ async def execute_metadata_expectation_from_file(
         ExpectationResultType.SUMMARY,
         description="Level of Details for a Expectation result",
     ),
-    datasets: UploadFile = File(...),
+    file: UploadFile = File(...),
 ):
 
     # read the dataset from uploaded CSV file
-    logger.info(f"dataset: {datasets.filename}")
-    df = pd.read_csv(datasets.file)
+    logger.info(f"dataset: {file.filename}")
+    dataset = await read_dataset(file, is_file=True)
+    # df = pd.read_csv(datasets.file)
+
+    # # metadata expectation
+    # expectation = await metadata_expectation_suite(
+    #     df, result_type, dataset_name=datasets.filename
+    # )
 
     # metadata expectation
     expectation = await metadata_expectation_suite(
-        df, result_type, dataset_name=datasets.filename
+        dataset, result_type, dataset_name=file.filename
     )
 
     return expectation

diff --git a/app/core/config.py b/app/core/config.py
@@ -16,9 +16,11 @@ class Settings(BaseSettings):
     MODE: str = "development"
     DOCS_URL: str = "/api/docs"
     EXAMPLE_FOLDER: str = "/Users/somitragupta/factly/news-room-datasets"
-    EXAMPLE_URL: str = "/Users/somitragupta/factly/factly-datasets/projects/rbi/\
+    EXAMPLE_URL: str = (
+        "/Users/somitragupta/factly/factly-datasets/projects/rbi/\
 data/processed/1_timeseries/5_handbook-of-statistics-on-the-indian-economy/\
 hbs-mb-scb-select-aggregates-weekly/output.csv"
+    )
     EXAMPLE_URL_COUNTRY: str = """https://storage.factly.org/mande/\
 edu-ministry/data/processed/statistics/1_AISHE_report/19_enrolment_foreign/output.csv"""
     EXAMPLE_URL_STATE: str = """https://storage.factly.org/mande/edu-ministry/data/\
@@ -298,7 +300,7 @@ class NoteSettings(BaseSettings):
             {
                 "expectation_type": "expect_column_values_to_match_regex_list",
                 "kwargs": {
-                    "column": "unit",
+                    "column": "note",
                     "regex_list": [",?.+?:[^,]+[,]?"],
                     "result_format": "SUMMARY",
                 },
@@ -314,7 +316,33 @@ class NoteSettings(BaseSettings):
 
 class CustomExpectationsSettings(BaseSettings):
 
+    NULL_DATETIME_VALUE_NAME: str = "Null date values Flag - {column}"
+    NULL_DATETIME_VALUE_MSG: str = (
+        "Null values should not be permitted for datetime values"
+    )
+
     NUMERIC_COLUMNS_TYPES = ["float64", "int64"]
+    NUMERIC_VALUES_PATTERN = re.compile(r"^-?\d+(\.\d{1,2})?$")
+    NUMERIC_EXPECTATION_NAME: str = (
+        "Numeric values in specific pattern - {column}"
+    )
+    NUMERIC_EXPECTATION_ERR_MSG: str = (
+        "Numeric values should be in proper format both integer and float(roundoff to two decimal places)"
+    )
+
+    NEGATIVE_NUMERIC_VALUES_PATTERN = re.compile(r"^-\d+(\.\d{1,})?$")
+    NEGATIVE_NUMERIC_EXPECTATION_NAME: str = (
+        "Negative Numeric values Flag - {column}"
+    )
+    NEGATIVE_NUMERIC_EXPECTATION_ERR_MSG: str = (
+        "Flag Numeric values that are negative"
+    )
+
+    COLUMN_NAMES_PATTERN = re.compile(r"^[a-z]+(?:_[a-z]+)*$")
+    COLUMN_NAMES_EXPECTATION_NAME: str = "Column names in specific pattern"
+    COLUMN_NAMES_EXPECTATION_ERR_MSG: str = (
+        "Column names should be in lower case and separated by underscore - {column}"
+    )
 
     TRAIL_OR_LEAD_WHITESPACE_PATTERN = re.compile(r"^\s+.*|.*\s+$")
     LEADING_TRAILING_WHITE_SPACE_EXPECTATION_NAME: str = (
@@ -334,7 +362,9 @@ class CustomExpectationsSettings(BaseSettings):
     SPECIAL_CHARACTER_EXPECTATION_NAME: str = (
         "No special characters in Columns"
     )
-    SPECIAL_CHARACTER_EXPECTATION_ERR_MSG: str = "There should be no special character in the category name and measured value, like Telangana** , and any additional information  should be captured in notes instead of using a special character"
+    SPECIAL_CHARACTER_EXPECTATION_ERR_MSG: str = (
+        "There should be no special character in the category name and measured value, like Telangana** , and any additional information  should be captured in notes instead of using a special character"
+    )
 
     BRACKET_PATTERN = re.compile(r".*([\[\(].+?[\)\]]).*")
     BRACKETS_EXPECTATION_NAME: str = "No unnecessary brackets in Categories"
@@ -358,7 +388,9 @@ class CustomExpectationsSettings(BaseSettings):
 
     MINIMUM_DATASET_OBSERVATION_THRESH: int = 10
     OBSERVATIONS_MORE_THAN_THRESH_NAME: str = "Minimum required observation"
-    OBSERVATIONS_MORE_THAN_THRESH_MSG: str = "Generally the datasets must be more a threshold number of observation ({thresh})"
+    OBSERVATIONS_MORE_THAN_THRESH_MSG: str = (
+        "Generally the datasets must be more a threshold number of observation ({thresh})"
+    )
 
 
 class MetadataSettings(BaseSettings):
@@ -504,6 +536,11 @@ class MetadataSettings(BaseSettings):
         ],
     }
 
+    DESCRIPTION_NAME: str = "Description"
+    DESCRIPTION_ERROR_MSG: str = (
+        "Description should be in the range of 50 to 5000"
+    )
+
     TIME_SAVED_IN_HOURS_NAME: str = "Null values in columns - {column}"
     TIME_SAVED_IN_HOURS_MSG: str = (
         "Null values should not present in these columns"

diff --git a/app/core/sector.csv b/app/core/sector.csv
@@ -50,4 +50,3 @@ Youth and Sports
 Banking
 Trade
 Water Resources
-Youth and Sports
diff --git a/app/expectations/custom_expectations.py b/app/expectations/custom_expectations.py
@@ -1,13 +1,16 @@
+import logging
 from datetime import date
 
 import numpy as np
+import pandas as pd
 from great_expectations.dataset import MetaPandasDataset, PandasDataset
 
 from app.core.config import CustomExpectationsSettings
 
 custom_expectation_settings = CustomExpectationsSettings()
 
 CURRENT_YEAR = str(date.today().year)
+logging.basicConfig(level=logging.INFO)
 
 
 class GenericCustomExpectations(PandasDataset):
@@ -78,3 +81,56 @@ def expect_multicolumn_dataset_to_have_more_than_x_rows(self, column_list):
             ),
             length,
         )
+
+    @MetaPandasDataset.multicolumn_map_expectation
+    def expect_numerical_values_to_be_in_specific_pattern(
+        self,
+        column_list,
+        pattern=custom_expectation_settings.NUMERIC_VALUES_PATTERN,
+        meta={
+            "expectation_name": "Numeric values in specific pattern",
+        },
+        include_meta=True,
+    ):
+        """Check that cell values, rendered as text, match ``pattern``.
+
+        Every cell of ``column_list`` is converted with ``str`` and tested
+        with ``pattern.match``; only the result for the first column is
+        returned.
+
+        Args:
+            column_list: Object exposing ``applymap`` and ``columns``
+                (presumably the multi-column selection handed in by the
+                decorator -- confirm against great_expectations).
+            pattern: Compiled regex; defaults to ``NUMERIC_VALUES_PATTERN``
+                from the custom expectation settings.
+            meta: Expectation metadata; not read in this body.
+            include_meta: Not read in this body.
+
+        Returns:
+            The boolean column for the first column of ``column_list``:
+            ``True`` where the value matches ``pattern``, else ``False``.
+        """
+
+        def matches_pattern(value):
+            return bool(pattern.match(str(value)))
+
+        match_flags = column_list.applymap(matches_pattern)
+        first_column = match_flags.columns[0]
+        return match_flags[first_column]
+
+    @MetaPandasDataset.multicolumn_map_expectation
+    def flag_negative_numerical_values(
+        self,
+        column_list,
+        pattern=custom_expectation_settings.NEGATIVE_NUMERIC_VALUES_PATTERN,
+        meta={
+            "expectation_name": "Negative Numeric values Flag",
+        },
+        include_meta=True,
+    ):
+        """Flag cells whose text form matches the negative-number pattern.
+
+        Every cell of ``column_list`` is converted with ``str`` and tested
+        with ``pattern.match``. The result is inverted, so a matching
+        (negative) value yields ``False`` and anything else yields ``True``.
+        Only the result for the first column is returned.
+
+        Args:
+            column_list: Object exposing ``applymap`` and ``columns``
+                (presumably the multi-column selection handed in by the
+                decorator -- confirm against great_expectations).
+            pattern: Compiled regex; defaults to
+                ``NEGATIVE_NUMERIC_VALUES_PATTERN`` from the custom
+                expectation settings.
+            meta: Expectation metadata; not read in this body.
+            include_meta: Not read in this body.
+
+        Returns:
+            The boolean column for the first column of ``column_list``:
+            ``False`` where the value matches ``pattern``, else ``True``.
+        """
+
+        def is_not_negative(value):
+            return not pattern.match(str(value))
+
+        pass_flags = column_list.applymap(is_not_negative)
+        first_column = pass_flags.columns[0]
+        return pass_flags[first_column]
+
+    @MetaPandasDataset.multicolumn_map_expectation
+    def expect_column_names_to_be_in_specific_pattern(
+        self,
+        column_list,
+        pattern=custom_expectation_settings.COLUMN_NAMES_PATTERN,
+        meta={
+            "expectation_name": "Values in specific pattern",
+        },
+        include_meta=True,
+        find_columns=False,
+    ):
+        """Report whether every column name matches ``pattern``.
+
+        Only the labels in ``column_list.columns`` are inspected; cell
+        values are ignored. ``meta``, ``include_meta`` and ``find_columns``
+        are accepted but not read in this body.
+
+        Returns:
+            A single boolean (the result of ``Series.all()``): ``True`` when
+            each column name, converted with ``str``, matches ``pattern``.
+        """
+        # NOTE(review): this returns one scalar, while the other
+        # multicolumn expectations in this class return a boolean column
+        # with one entry per row -- confirm the decorator accepts a scalar.
+        column_names = pd.Series(column_list.columns)
+        name_is_valid = column_names.apply(
+            lambda name: bool(pattern.match(str(name)))
+        )
+        return name_is_valid.all()
diff --git a/app/utils/common.py b/app/utils/common.py
@@ -25,7 +25,11 @@ def get_encoding(obj):
 
 
 async def read_dataset(
-    source: str, s3_client=None, bucket_name: Union[str, None] = None, **kwargs
+    source: str,
+    s3_client=None,
+    bucket_name: Union[str, None] = None,
+    is_file: bool = False,
+    **kwargs,
 ) -> ge.dataset.pandas_dataset.PandasDataset:
     if s3_client:
         # dataset should be downloaded from s3 storage
@@ -42,7 +46,19 @@ async def read_dataset(
         finally:
             response.close()
             response.release_conn()
-
+    elif is_file:
+        try:
+            file = source.file.read()
+            dataset = ge.read_csv(BytesIO(file))
+            logger.info(f"Dataset read from : {source.filename}")
+        except UnicodeDecodeError:
+            encoding = get_encoding(obj=file)
+            dataset = ge.read_csv(BytesIO(file), encoding=encoding)
+            logger.info(
+                f"Dataset read from : {source.filename} with non-utf8 encoding"
+            )
+        except Exception as e:
+            logger.info(f"Error reading Dataset from : {source.filename}: {e}")
     else:
         session = kwargs.pop("session")
         try:
@@ -96,6 +112,20 @@ async def modify_default_expectation_suite(
     return expectation_suite
 
 
+async def modify_values_to_be_in_between(
+    changed_config: dict, default_config: dict
+) -> dict:
+    """Apply overrides to each ``expect_column_values_to_be_between`` entry.
+
+    Walks ``default_config["expectations"]`` and, for every expectation
+    whose ``expectation_type`` is ``expect_column_values_to_be_between``,
+    updates its ``kwargs`` in place with
+    ``changed_config["expect_column_values_to_be_between"]``.
+
+    Args:
+        changed_config: Mapping holding the override kwargs under the
+            ``expect_column_values_to_be_between`` key.
+        default_config: Expectation-suite mapping with an ``expectations``
+            list; it is mutated in place.
+
+    Returns:
+        The same ``default_config`` object, after the in-place update.
+
+    Raises:
+        KeyError: If ``default_config`` has no ``expectations`` key, or if
+            a matching expectation exists but ``changed_config`` lacks the
+            override key.
+    """
+    target_type = "expect_column_values_to_be_between"
+    for expectation in default_config["expectations"]:
+        if expectation["expectation_type"] == target_type:
+            # Looked up lazily so a missing override only fails when a
+            # matching expectation actually exists.
+            expectation["kwargs"].update(changed_config[target_type])
+    return default_config
+
+
 async def modify_values_to_be_in_set(
     changed_config: dict, default_config: str
 ):
diff --git a/app/core/sector.csv b/app/core/sector.csv
@@ -50,4 +50,3 @@ Youth and Sports
     Banking
     Trade
     Water Resources
-    Youth and Sports