Skip to content

Commit

Permalink
Modified the way of reading standard dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
venu-sambarapu-DS committed Jun 4, 2024
1 parent 4e6455b commit e384308
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 52 deletions.
25 changes: 15 additions & 10 deletions app/api/api_v1/routers/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,20 @@
# Fetch the "master_dictionaries" sheet as CSV (tqx=out:csv) through
# g_sheet_session, which is set up earlier in this module (not shown here).
g_sheet_response = g_sheet_session.get("https://docs.google.com/spreadsheets/d/1NEsFJGr5IHsrIakGgeNFUvz5zpLOadh_vDH7Apqmv9E/gviz/tq?tqx=out:csv&sheet=master_dictionaries")
# Raw response body as bytes.
g_sheet_bytes_data = g_sheet_response.content
# Decode the bytes as UTF-8 and parse the CSV text into a DataFrame.
data = pd.read_csv(io.StringIO(g_sheet_bytes_data.decode('utf-8')))
print("reading data from google sheet@@@@")
# data.rename(
# columns={
# "country_standard_name": "country",
# "unique_standard_airline_name": "airline",
# "standard_disease_name": "disease",
# "psu_companies": "psu",
# }
# )
# print(data.columns.tolist())

# Renamed copy of the master dictionary: the verbose sheet headers are
# mapped to the short entity names used as column keys by the validators.
# `rename` without `inplace` returns a new frame, so `data` stays untouched.
standard_data_values = data.rename(
    columns={
        "country_standard_name": "country",
        "unique_standard_airline_name": "airline",
        "standard_disease_name": "diseases",
        "psu_companies": "psu",
        "standard_district_name": "district",
        "standard_states": "state",
        "insurance_standard_names": "insurance_companies",
    }
)


@router.get("/", summary="Get all Saved Entities csv file name")
Expand All @@ -50,6 +54,7 @@ async def get_entity_data(entity: str):
json_compatible_item_data = jsonable_encoder(
entity_df.to_dict(orient="records")
)
print(json_compatible_item_data)
return JSONResponse(content=json_compatible_item_data)


Expand Down
12 changes: 5 additions & 7 deletions app/utils/airline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import great_expectations as ge
from fastapi.encoders import jsonable_encoder

from app.core.config import APP_DIR, AirlineSettings, Settings
from app.api.api_v1.routers.dictionary import standard_data_values
from app.core.config import AirlineSettings, Settings
from app.utils.column_mapping import find_airline_name_columns
from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset
from app.utils.common import modify_values_to_be_in_set

settings = Settings()
airline_settings = AirlineSettings()
Expand All @@ -14,10 +14,8 @@ async def modify_airline_name_expectation_suite(
):
default_expectation_suite = airline_settings.AIRLINE_NAME_EXPECTATION

airline_names_dataset = await read_pandas_dataset(
APP_DIR / "core" / "airline_names.csv"
)
airline_names_list = airline_names_dataset["airline_names"].tolist()
airline_names_dataset = standard_data_values[["airline"]].dropna()
airline_names_list = airline_names_dataset["airline"].tolist()

changed_config = {
"expect_column_values_to_be_in_set": {
Expand Down
16 changes: 8 additions & 8 deletions app/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
import re
from io import BytesIO
from typing import Union

# from app.api.api_v1.routers.dictionary import data as dictionary_data
import great_expectations as ge
import pandas as pd
from charset_normalizer import from_bytes
from fastapi.logger import logger

from app.core.config import APP_DIR, GeographySettings
from app.core.config import GeographySettings

logging.basicConfig(level=logging.INFO)
geographic_settings = GeographySettings()
Expand Down Expand Up @@ -79,12 +79,12 @@ async def read_pandas_dataset(source: str, **kwargs):
return dataset


async def load_values_to_be_in_set(domain: str):
# this function is used to load csv files, consisting values
# for states or country that are required to be in specific set
set_values_file = APP_DIR / "core" / f"{domain}.csv"
set_values = pd.read_csv(set_values_file)[f"{domain}"].unique()
return set_values
# async def load_values_to_be_in_set(domain: str):
# # this function is used to load csv files, consisting values
# # for states or country that are required to be in specific set
# set_values_file = APP_DIR / "core" / f"{domain}.csv"
# set_values = pd.read_csv(set_values_file)[f"{domain}"].unique()
# return set_values


async def modify_column_names_to_expectation_suite(
Expand Down
13 changes: 5 additions & 8 deletions app/utils/geography.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
import asyncio
from collections import ChainMap

from app.api.api_v1.routers.dictionary import standard_data_values
import great_expectations as ge
from fastapi.encoders import jsonable_encoder

from app.core.config import APP_DIR, GeographySettings, Settings
from app.core.config import GeographySettings, Settings
from app.utils.column_mapping import find_geography_columns
from app.utils.common import (
modify_values_to_be_in_set,
read_dataset,
read_pandas_dataset,
)

settings = Settings()
Expand All @@ -19,7 +18,7 @@
async def modify_city_expectation_suite(column_name: str, result_format: str):
default_expectation_suite = geograhy_setting.STATE_EXPECTATION

city_dataset = await read_pandas_dataset(APP_DIR / "core" / "district.csv")
# Keep only the district column and drop empty cells before building the list.
city_dataset = standard_data_values[["district"]].dropna()
# The frame holds a single column named "district" (the renamed
# "standard_district_name"); index it with that exact key.
city_list = city_dataset["district"].tolist()

changed_config = {
Expand Down Expand Up @@ -65,7 +64,7 @@ async def city_expectation_suite(dataset, result_format):
async def modify_state_expectation_suite(column_name: str, result_format: str):
default_expectation_suite = geograhy_setting.STATE_EXPECTATION

state_dataset = await read_pandas_dataset(APP_DIR / "core" / "state.csv")
state_dataset = standard_data_values[["state"]].dropna()
state_list = state_dataset["state"].tolist()

changed_config = {
Expand Down Expand Up @@ -112,9 +111,7 @@ async def modify_country_expectation_suite(
):
default_expectation_suite = geograhy_setting.COUNTRY_EXPECTATION

country_dataset = await read_pandas_dataset(
APP_DIR / "core" / "country.csv"
)
country_dataset = standard_data_values[["country"]].dropna()
country_list = country_dataset["country"].tolist()

changed_config = {
Expand Down
10 changes: 4 additions & 6 deletions app/utils/insurance.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import great_expectations as ge
from fastapi.encoders import jsonable_encoder

from app.core.config import APP_DIR, InsuranceCompanySettings, Settings
from app.api.api_v1.routers.dictionary import standard_data_values
from app.core.config import InsuranceCompanySettings, Settings
from app.utils.column_mapping import find_insurance_company_columns
from app.utils.common import modify_values_to_be_in_set, read_pandas_dataset
from app.utils.common import modify_values_to_be_in_set

settings = Settings()
insurance_company_settings = InsuranceCompanySettings()
Expand All @@ -16,9 +16,7 @@ async def modify_insurance_company_name_expectation_suite(
insurance_company_settings.INSURANCE_COMPANY_NAME_EXPECTATION
)

insurance_company_names_dataset = await read_pandas_dataset(
APP_DIR / "core" / "insurance_companies.csv"
)
# Drop empty cells (read by pandas as NaN) so they never end up in the
# list of allowed insurance company names.
insurance_company_names_dataset = standard_data_values[
    ["insurance_companies"]
].dropna()
insurance_company_names_list = insurance_company_names_dataset[
    "insurance_companies"
].tolist()
Expand Down
23 changes: 10 additions & 13 deletions app/utils/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,22 @@
import great_expectations as ge
from fastapi.encoders import jsonable_encoder

from app.core.config import APP_DIR, MetadataSettings, Settings
from app.core.config import MetadataSettings, Settings
from app.utils.column_mapping import find_metadata_columns
from app.utils.common import (
modify_values_to_be_in_set,
modify_values_to_match_regex_list,
read_dataset,
read_pandas_dataset,
)
from app.api.api_v1.routers.dictionary import standard_data_values
from app.utils.general import general_metadata_expectation_suite
from app.utils.tags import tags_expectation_suite
from app.utils.unit import unit_expectation_suite

settings = Settings()
meta_data_setting = MetadataSettings()
# TODO: if short_form values are ever needed from the dictionary, uncomment the following line.
# short_form_dataset = standard_data_values[["short_form"]].dropna()


async def modify_sector_expectation_suite(
Expand All @@ -26,7 +28,7 @@ async def modify_sector_expectation_suite(

default_expectation_suite = meta_data_setting.SECTOR_EXPECTATION

sector_dataset = await read_pandas_dataset(APP_DIR / "core" / "sector.csv")
sector_dataset = standard_data_values[["sector"]].dropna()
sector_list = sector_dataset["sector"].tolist()

changed_config = {
Expand Down Expand Up @@ -86,9 +88,7 @@ async def modify_organization_expectation_suite(
):
default_expectation_suite = meta_data_setting.ORGANIZATION_EXPECTATION

organization_dataset = await read_pandas_dataset(
APP_DIR / "core" / "organization.csv"
)
organization_dataset = standard_data_values[["organization"]].dropna()
organization_list = organization_dataset["organization"].tolist()

changed_config = {
Expand Down Expand Up @@ -148,10 +148,9 @@ async def modify_short_form_expectation_suite(
):
default_expectation_suite = meta_data_setting.SHORT_FORM_EXPECTATION

short_form_dataset = await read_pandas_dataset(
APP_DIR / "core" / "short_form.csv"
)
short_form_list = short_form_dataset["short_form"].tolist()
# NOTE: Modify the short_form_expectation_suite to use short_form
short_form_dataset = {"short_form": ""}
short_form_list = short_form_dataset["short_form"]

changed_config = {
"expect_column_values_to_be_in_set": {
Expand Down Expand Up @@ -210,9 +209,7 @@ async def modify_frequency_of_update_expectation_suite(
meta_data_setting.FREQUENCY_OF_UPDATE_EXPECTATION
)

frequency_of_update_dataset = await read_pandas_dataset(
APP_DIR / "core" / "frequency_of_update.csv"
)
frequency_of_update_dataset = standard_data_values[["frequency_of_update"]].dropna()
frequency_of_update_list = frequency_of_update_dataset[
"frequency_of_update"
].tolist()
Expand Down

0 comments on commit e384308

Please sign in to comment.