From e38430869a5ced9c600f5a2c14a03db64ece1466 Mon Sep 17 00:00:00 2001
From: venu-sambarapu-DS <venu.sambarapu@factly.in>
Date: Tue, 4 Jun 2024 14:33:50 +0530
Subject: [PATCH] Read standard datasets from the master Google Sheet instead of local CSVs

---
 app/api/api_v1/routers/dictionary.py | 25 +++++++++++++++----------
 app/utils/airline.py                 | 12 +++++-------
 app/utils/common.py                  | 16 ++++++++--------
 app/utils/geography.py               | 13 +++++--------
 app/utils/insurance.py               | 10 ++++------
 app/utils/metadata.py                | 23 ++++++++++-------------
 6 files changed, 47 insertions(+), 52 deletions(-)

diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py
index c59bcad..f962ee5 100644
--- a/app/api/api_v1/routers/dictionary.py
+++ b/app/api/api_v1/routers/dictionary.py
@@ -18,16 +18,20 @@
 g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries")
 g_sheet_bytes_data = g_sheet_response.content
 data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8')))
-print("reading data from google sheet@@@@")
-# data.rename(
-#     columns={
-#         "country_standard_name": "country",
-#         "unique_standard_airline_name": "airline",
-#         "standard_disease_name": "disease",
-#         "psu_companies": "psu",
-#     }
-# )
-# print(data.columns.tolist())
+
+# Copy of the master sheet with its verbose headers renamed to the short
+# column names that the app.utils modules select from this frame.
+standard_data_values = data.rename(
+    columns={
+        "country_standard_name": "country",
+        "unique_standard_airline_name": "airline",
+        "standard_disease_name": "diseases",
+        "psu_companies": "psu",
+        "standard_district_name": "district",
+        "standard_states": "state",
+        "insurance_standard_names": "insurance_companies",
+    },
+)
 
 
 @router.get("/", summary="Get all Saved Entities csv file name")
@@ -50,6 +54,7 @@ async def get_entity_data(entity: str):
     json_compatible_item_data = jsonable_encoder(
         entity_df.to_dict(orient="records")
     )
+    print(json_compatible_item_data)
     return JSONResponse(content=json_compatible_item_data)
 
 
diff --git a/app/utils/airline.py b/app/utils/airline.py
index 6578e3f..be39bd3 100644
--- a/app/utils/airline.py
+++ b/app/utils/airline.py
@@ -1,9 +1,9 @@
 import great_expectations as ge
 from fastapi.encoders import jsonable_encoder
-
-from app.core.config import APP_DIR, AirlineSettings, Settings
+from app.api.api_v1.routers.dictionary import standard_data_values
+from app.core.config import AirlineSettings, Settings
 from app.utils.column_mapping import find_airline_name_columns
-from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset
+from app.utils.common import modify_values_to_be_in_set
 
 settings = Settings()
 airline_settings = AirlineSettings()
@@ -14,10 +14,8 @@ async def modify_airline_name_expectation_suite(
 ):
     default_expectation_suite = airline_settings.AIRLINE_NAME_EXPECTATION
 
-    airline_names_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "airline_names.csv"
-    )
-    airline_names_list = airline_names_dataset["airline_names"].tolist()
+    airline_names_dataset = standard_data_values[["airline"]].dropna()
+    airline_names_list = airline_names_dataset["airline"].tolist()
 
     changed_config = {
         "expect_column_values_to_be_in_set": {
diff --git a/app/utils/common.py b/app/utils/common.py
index e6c702e..28b4225 100644
--- a/app/utils/common.py
+++ b/app/utils/common.py
@@ -2,13 +2,13 @@
 import re
 from io import BytesIO
 from typing import Union
-
+# from app.api.api_v1.routers.dictionary import data as dictionary_data
 import great_expectations as ge
 import pandas as pd
 from charset_normalizer import from_bytes
 from fastapi.logger import logger
 
-from app.core.config import APP_DIR, GeographySettings
+from app.core.config import GeographySettings
 
 logging.basicConfig(level=logging.INFO)
 geographic_settings = GeographySettings()
@@ -79,12 +79,12 @@ async def read_pandas_dataset(source: str, **kwargs):
     return dataset
 
 
-async def load_values_to_be_in_set(domain: str):
-    # this function is used to load csv files, consisting values
-    # for states or country that are required to be in specific set
-    set_values_file = APP_DIR / "core" / f"{domain}.csv"
-    set_values = pd.read_csv(set_values_file)[f"{domain}"].unique()
-    return set_values
+# async def load_values_to_be_in_set(domain: str):
+#     # this function is used to load csv files, consisting values
+#     # for states or country that are required to be in specific set
+#     set_values_file = APP_DIR / "core" / f"{domain}.csv"
+#     set_values = pd.read_csv(set_values_file)[f"{domain}"].unique()
+#     return set_values
 
 
 async def modify_column_names_to_expectation_suite(
diff --git a/app/utils/geography.py b/app/utils/geography.py
index cceeacf..e876320 100644
--- a/app/utils/geography.py
+++ b/app/utils/geography.py
@@ -1,15 +1,14 @@
 import asyncio
 from collections import ChainMap
-
+from app.api.api_v1.routers.dictionary import standard_data_values
 import great_expectations as ge
 from fastapi.encoders import jsonable_encoder
 
-from app.core.config import APP_DIR, GeographySettings, Settings
+from app.core.config import GeographySettings, Settings
 from app.utils.column_mapping import find_geography_columns
 from app.utils.common import (
     modify_values_to_be_in_set,
     read_dataset,
-    read_pandas_dataset,
 )
 
 settings = Settings()
@@ -19,7 +18,7 @@
 async def modify_city_expectation_suite(column_name: str, result_format: str):
     default_expectation_suite = geograhy_setting.STATE_EXPECTATION
 
-    city_dataset = await read_pandas_dataset(APP_DIR / "core" / "district.csv")
-    city_list = city_dataset["districts"].tolist()
+    city_dataset = standard_data_values[["district"]].dropna()
+    city_list = city_dataset["district"].tolist()
 
     changed_config = {
@@ -65,7 +64,7 @@ async def city_expectation_suite(dataset, result_format):
 async def modify_state_expectation_suite(column_name: str, result_format: str):
     default_expectation_suite = geograhy_setting.STATE_EXPECTATION
 
-    state_dataset = await read_pandas_dataset(APP_DIR / "core" / "state.csv")
+    state_dataset = standard_data_values[["state"]].dropna()
     state_list = state_dataset["state"].tolist()
 
     changed_config = {
@@ -112,9 +111,7 @@ async def modify_country_expectation_suite(
 ):
     default_expectation_suite = geograhy_setting.COUNTRY_EXPECTATION
 
-    country_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "country.csv"
-    )
+    country_dataset = standard_data_values[["country"]].dropna()
     country_list = country_dataset["country"].tolist()
 
     changed_config = {
diff --git a/app/utils/insurance.py b/app/utils/insurance.py
index c78dad9..93d64d3 100644
--- a/app/utils/insurance.py
+++ b/app/utils/insurance.py
@@ -1,9 +1,9 @@
 import great_expectations as ge
 from fastapi.encoders import jsonable_encoder
-
-from app.core.config import APP_DIR, InsuranceCompanySettings, Settings
+from app.api.api_v1.routers.dictionary import standard_data_values
+from app.core.config import InsuranceCompanySettings, Settings
 from app.utils.column_mapping import find_insurance_company_columns
-from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset
+from app.utils.common import modify_values_to_be_in_set
 
 settings = Settings()
 insurance_company_settings = InsuranceCompanySettings()
@@ -16,9 +16,7 @@ async def modify_insurance_company_name_expectation_suite(
         insurance_company_settings.INSURANCE_COMPANY_NAME_EXPECTATION
     )
 
-    insurance_company_names_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "insurance_companies.csv"
-    )
+    insurance_company_names_dataset = standard_data_values[["insurance_companies"]].dropna()
     insurance_company_names_list = insurance_company_names_dataset[
         "insurance_companies"
     ].tolist()
diff --git a/app/utils/metadata.py b/app/utils/metadata.py
index c9699be..1c0745c 100644
--- a/app/utils/metadata.py
+++ b/app/utils/metadata.py
@@ -4,20 +4,22 @@
 import great_expectations as ge
 from fastapi.encoders import jsonable_encoder
 
-from app.core.config import APP_DIR, MetadataSettings, Settings
+from app.core.config import MetadataSettings, Settings
 from app.utils.column_mapping import find_metadata_columns
 from app.utils.common import (
     modify_values_to_be_in_set,
     modify_values_to_match_regex_list,
     read_dataset,
-    read_pandas_dataset,
 )
+from app.api.api_v1.routers.dictionary import standard_data_values
 from app.utils.general import general_metadata_expectation_suite
 from app.utils.tags import tags_expectation_suite
 from app.utils.unit import unit_expectation_suite
 
 settings = Settings()
 meta_data_setting = MetadataSettings()
+# TODO: uncomment below once short_form values are read from the dictionary
+# short_form_dataset = standard_data_values[["short_form"]].dropna()
 
 
 async def modify_sector_expectation_suite(
@@ -26,7 +28,7 @@ async def modify_sector_expectation_suite(
 
     default_expectation_suite = meta_data_setting.SECTOR_EXPECTATION
 
-    sector_dataset = await read_pandas_dataset(APP_DIR / "core" / "sector.csv")
+    sector_dataset = standard_data_values[["sector"]].dropna()
     sector_list = sector_dataset["sector"].tolist()
 
     changed_config = {
@@ -86,9 +88,7 @@ async def modify_organization_expectation_suite(
 ):
     default_expectation_suite = meta_data_setting.ORGANIZATION_EXPECTATION
 
-    organization_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "organization.csv"
-    )
+    organization_dataset = standard_data_values[["organization"]].dropna()
     organization_list = organization_dataset["organization"].tolist()
 
     changed_config = {
@@ -148,10 +148,9 @@ async def modify_short_form_expectation_suite(
 ):
     default_expectation_suite = meta_data_setting.SHORT_FORM_EXPECTATION
 
-    short_form_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "short_form.csv"
-    )
-    short_form_list = short_form_dataset["short_form"].tolist()
+    # NOTE: placeholder until short_form is read from the dictionary; kept as an empty list like the other *_list values
+    short_form_dataset = {"short_form": []}
+    short_form_list = short_form_dataset["short_form"]
 
     changed_config = {
         "expect_column_values_to_be_in_set": {
@@ -210,9 +209,7 @@ async def modify_frequency_of_update_expectation_suite(
         meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION
     )
 
-    frequency_of_update_dataset = await read_pandas_dataset(
-        APP_DIR / "core" / "frequency_of_update.csv"
-    )
+    frequency_of_update_dataset = standard_data_values[["frequency_of_update"]].dropna()
     frequency_of_update_list = frequency_of_update_dataset[
         "frequency_of_update"
     ].tolist()