From e38430869a5ced9c600f5a2c14a03db64ece1466 Mon Sep 17 00:00:00 2001 From: venu-sambarapu-DS Date: Tue, 4 Jun 2024 14:33:50 +0530 Subject: [PATCH] Modified the way of reading standard dataset --- app/api/api_v1/routers/dictionary.py | 25 +++++++++++++++---------- app/utils/airline.py | 12 +++++------- app/utils/common.py | 16 ++++++++-------- app/utils/geography.py | 13 +++++-------- app/utils/insurance.py | 10 ++++------ app/utils/metadata.py | 23 ++++++++++------------- 6 files changed, 47 insertions(+), 52 deletions(-) diff --git a/app/api/api_v1/routers/dictionary.py b/app/api/api_v1/routers/dictionary.py index c59bcad..f962ee5 100644 --- a/app/api/api_v1/routers/dictionary.py +++ b/app/api/api_v1/routers/dictionary.py @@ -18,16 +18,20 @@ g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries") g_sheet_bytes_data = g_sheet_response.content data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8'))) -print("reading data from google sheet@@@@") -# data.rename( -# columns={ -# "country_standard_name": "country", -# "unique_standard_airline_name": "airline", -# "standard_disease_name": "disease", -# "psu_companies": "psu", -# } -# ) -# print(data.columns.tolist()) + +standard_data_values = data.copy() +standard_data_values.rename( + columns={ + "country_standard_name": "country", + "unique_standard_airline_name": "airline", + "standard_disease_name": "diseases", + "psu_companies": "psu", + "standard_district_name": "district", + "standard_states": "state", + "insurance_standard_names": "insurance_companies" + }, + inplace=True, +) @router.get("/", summary="Get all Saved Entities csv file name") @@ -50,6 +54,7 @@ async def get_entity_data(entity: str): json_compatible_item_data = jsonable_encoder( entity_df.to_dict(orient="records") ) + print(json_compatible_item_data) return JSONResponse(content=json_compatible_item_data) diff --git 
a/app/utils/airline.py b/app/utils/airline.py index 6578e3f..be39bd3 100644 --- a/app/utils/airline.py +++ b/app/utils/airline.py @@ -1,9 +1,9 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder - -from app.core.config import APP_DIR, AirlineSettings, Settings +from app.api.api_v1.routers.dictionary import standard_data_values +from app.core.config import AirlineSettings, Settings from app.utils.column_mapping import find_airline_name_columns -from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset +from app.utils.common import modify_values_to_be_in_set settings = Settings() airline_settings = AirlineSettings() @@ -14,10 +14,8 @@ async def modify_airline_name_expectation_suite( ): default_expectation_suite = airline_settings.AIRLINE_NAME_EXPECTATION - airline_names_dataset = await read_pandas_dataset( - APP_DIR / "core" / "airline_names.csv" - ) - airline_names_list = airline_names_dataset["airline_names"].tolist() + airline_names_dataset = standard_data_values[["airline"]].dropna() + airline_names_list = airline_names_dataset["airline"].tolist() changed_config = { "expect_column_values_to_be_in_set": { diff --git a/app/utils/common.py b/app/utils/common.py index e6c702e..28b4225 100644 --- a/app/utils/common.py +++ b/app/utils/common.py @@ -2,13 +2,13 @@ import re from io import BytesIO from typing import Union - +# from app.api.api_v1.routers.dictionary import data as dictionary_data import great_expectations as ge import pandas as pd from charset_normalizer import from_bytes from fastapi.logger import logger -from app.core.config import APP_DIR, GeographySettings +from app.core.config import GeographySettings logging.basicConfig(level=logging.INFO) geographic_settings = GeographySettings() @@ -79,12 +79,12 @@ async def read_pandas_dataset(source: str, **kwargs): return dataset -async def load_values_to_be_in_set(domain: str): - # this function is used to load csv files, consisting values - # for states or 
country that are required to be in specific set - set_values_file = APP_DIR / "core" / f"{domain}.csv" - set_values = pd.read_csv(set_values_file)[f"{domain}"].unique() - return set_values +# async def load_values_to_be_in_set(domain: str): +# # this function is used to load csv files, consisting values +# # for states or country that are required to be in specific set +# set_values_file = APP_DIR / "core" / f"{domain}.csv" +# set_values = pd.read_csv(set_values_file)[f"{domain}"].unique() +# return set_values async def modify_column_names_to_expectation_suite( diff --git a/app/utils/geography.py b/app/utils/geography.py index cceeacf..e876320 100644 --- a/app/utils/geography.py +++ b/app/utils/geography.py @@ -1,15 +1,14 @@ import asyncio from collections import ChainMap - +from app.api.api_v1.routers.dictionary import standard_data_values import great_expectations as ge from fastapi.encoders import jsonable_encoder -from app.core.config import APP_DIR, GeographySettings, Settings +from app.core.config import GeographySettings, Settings from app.utils.column_mapping import find_geography_columns from app.utils.common import ( modify_values_to_be_in_set, read_dataset, - read_pandas_dataset, ) settings = Settings() @@ -19,7 +18,7 @@ async def modify_city_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - city_dataset = await read_pandas_dataset(APP_DIR / "core" / "district.csv") - city_list = city_dataset["districts"].tolist() + city_dataset = standard_data_values[["district"]].dropna() + city_list = city_dataset["district"].tolist() changed_config = { @@ -65,7 +64,7 @@ async def city_expectation_suite(dataset, result_format): async def modify_state_expectation_suite(column_name: str, result_format: str): default_expectation_suite = geograhy_setting.STATE_EXPECTATION - state_dataset = await read_pandas_dataset(APP_DIR / "core" / "state.csv") + state_dataset = standard_data_values[["state"]].dropna() state_list = state_dataset["state"].tolist() 
changed_config = { @@ -112,9 +111,7 @@ async def modify_country_expectation_suite( ): default_expectation_suite = geograhy_setting.COUNTRY_EXPECTATION - country_dataset = await read_pandas_dataset( - APP_DIR / "core" / "country.csv" - ) + country_dataset = standard_data_values[["country"]].dropna() country_list = country_dataset["country"].tolist() changed_config = { diff --git a/app/utils/insurance.py b/app/utils/insurance.py index c78dad9..93d64d3 100644 --- a/app/utils/insurance.py +++ b/app/utils/insurance.py @@ -1,9 +1,9 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder - -from app.core.config import APP_DIR, InsuranceCompanySettings, Settings +from app.api.api_v1.routers.dictionary import standard_data_values +from app.core.config import InsuranceCompanySettings, Settings from app.utils.column_mapping import find_insurance_company_columns -from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset +from app.utils.common import modify_values_to_be_in_set settings = Settings() insurance_company_settings = InsuranceCompanySettings() @@ -16,9 +16,7 @@ async def modify_insurance_company_name_expectation_suite( insurance_company_settings.INSURANCE_COMPANY_NAME_EXPECTATION ) - insurance_company_names_dataset = await read_pandas_dataset( - APP_DIR / "core" / "insurance_companies.csv" - ) + insurance_company_names_dataset = standard_data_values[["insurance_companies"]].dropna() insurance_company_names_list = insurance_company_names_dataset[ "insurance_companies" ].tolist() diff --git a/app/utils/metadata.py b/app/utils/metadata.py index c9699be..1c0745c 100644 --- a/app/utils/metadata.py +++ b/app/utils/metadata.py @@ -4,20 +4,22 @@ import great_expectations as ge from fastapi.encoders import jsonable_encoder -from app.core.config import APP_DIR, MetadataSettings, Settings +from app.core.config import MetadataSettings, Settings from app.utils.column_mapping import find_metadata_columns from app.utils.common import ( 
modify_values_to_be_in_set, modify_values_to_match_regex_list, read_dataset, - read_pandas_dataset, ) +from app.api.api_v1.routers.dictionary import standard_data_values from app.utils.general import general_metadata_expectation_suite from app.utils.tags import tags_expectation_suite from app.utils.unit import unit_expectation_suite settings = Settings() meta_data_setting = MetadataSettings() +# todo: in future if we need short_form values from dictionary uncomment the following +# short_form_dataset = standard_data_values[["short_form"]].dropna() async def modify_sector_expectation_suite( @@ -26,7 +28,7 @@ async def modify_sector_expectation_suite( default_expectation_suite = meta_data_setting.SECTOR_EXPECTATION - sector_dataset = await read_pandas_dataset(APP_DIR / "core" / "sector.csv") + sector_dataset = standard_data_values[["sector"]].dropna() sector_list = sector_dataset["sector"].tolist() changed_config = { @@ -86,9 +88,7 @@ async def modify_organization_expectation_suite( ): default_expectation_suite = meta_data_setting.ORGANIZATION_EXPECTATION - organization_dataset = await read_pandas_dataset( - APP_DIR / "core" / "organization.csv" - ) + organization_dataset = standard_data_values[["organization"]].dropna() organization_list = organization_dataset["organization"].tolist() changed_config = { @@ -148,10 +148,9 @@ async def modify_short_form_expectation_suite( ): default_expectation_suite = meta_data_setting.SHORT_FORM_EXPECTATION - short_form_dataset = await read_pandas_dataset( - APP_DIR / "core" / "short_form.csv" - ) - short_form_list = short_form_dataset["short_form"].tolist() + # NOTE: Modify the short_form_expectation_suite to use short_form + short_form_dataset = {"short_form": ""} + short_form_list = short_form_dataset["short_form"] changed_config = { "expect_column_values_to_be_in_set": { @@ -210,9 +209,7 @@ async def modify_frequency_of_update_expectation_suite( meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION ) - frequency_of_update_dataset = 
await read_pandas_dataset( - APP_DIR / "core" / "frequency_of_update.csv" - ) + frequency_of_update_dataset = standard_data_values[["frequency_of_update"]].dropna() frequency_of_update_list = frequency_of_update_dataset[ "frequency_of_update" ].tolist()