import re


def map_columns(colname: str, map_dict: dict) -> str:
    """Standardise a column name using the mapping from the config file.

    The name is trimmed and lower-cased, punctuation is removed, and every
    run of whitespace is replaced by a single underscore. The normalised
    name is then looked up among the aliases listed in ``map_dict``.

    Args:
        colname (str): Current column name in the DataFrame.
        map_dict (dict): Maps each standardised column name to the
            normalised raw name(s) it should replace. A value may be a
            collection of aliases or a single alias string.

    Returns:
        str: The first key of ``map_dict`` whose aliases contain the
        normalised name; otherwise the normalised name itself. A label that
        is not a string (e.g. a NaN or integer header) is returned unchanged.
    """
    # Non-string labels cannot be normalised; pass them through instead of
    # failing on the string methods below.
    if not isinstance(colname, str):
        return colname

    # NOTE: trimming happens before punctuation is removed, so whitespace
    # exposed by a stripped leading/trailing symbol still becomes "_".
    cleaned = re.sub(r"[^\w\s]", "", colname.strip().lower())
    cleaned = re.sub(r"\s+", "_", cleaned)

    for standard_name, aliases in map_dict.items():
        if isinstance(aliases, str):
            # A lone alias must match exactly; `in` on a string would be a
            # substring test ("age" in "patient_age").
            if cleaned == aliases:
                return standard_name
        elif cleaned in aliases:
            return standard_name
    return cleaned
def standardise_age2(age) -> float:
    """Convert a mixed age entry to a float number of years.

    String entries must start with a number, optionally followed by a unit:

    * no unit, or a unit that is neither months nor years+months: the
      number is returned as years;
    * a unit starting with ``m``/``M``: values below 13 are read as months
      and converted to years (2 d.p.); larger values are returned as-is;
    * a unit starting with ``y``/``Y`` followed by a month count
      (e.g. ``"2y 6m"``, ``"2 years 6 months"``): years plus months/12.

    Args:
        age (str/float/int): age specified in the raw dataset

    Returns:
        float: standardised age in years, or NaN when it cannot be parsed
    """
    if isinstance(age, str):
        # Trim first so that " 25" is not rejected by the anchored pattern.
        match = re.search(r'^(\d+\.?\d*)\s*(.*)$', age.strip())
        if match is None:
            return np.nan

        years = float(match.group(1))
        unit = match.group(2)
        if not unit:
            return years

        if re.match(r'[mM]', unit):
            # Below 13 the number is taken to be a month count; anything
            # larger is returned unchanged.
            if years < 13:
                return round(years / 12, 2)
            return years

        # Years followed by months; whitespace may separate the month count
        # from its unit ("2 years 6 months").
        month_match = re.match(r'[yY]\D*(\d{1,2})\s*[mM]', unit)
        if month_match:
            return years + round(float(month_match.group(1)) / 12, 2)
        return years

    # NumPy integers are not instances of int, so check for them explicitly.
    if isinstance(age, (int, np.integer)):
        return float(age)
    if isinstance(age, float):
        return age
    return np.nan


def standardise_gender2(gender: str) -> str:
    """Convert a mixed gender entry to a standard format.

    The entry is classified by its first letter: F, W or G (e.g. female,
    woman, girl) gives FEMALE; M or B (e.g. male, man, boy) gives MALE.
    Leading non-letter characters are ignored.

    Args:
        gender (str): gender entry in the raw dataset

    Returns:
        str: standardised gender (FEMALE, MALE, UNKNOWN)
    """
    text = str(gender).upper()

    # Only the first letter decides: letters later in the value (the W in
    # "UNKNOWN", the G in "NOT GIVEN") must not mark the entry as female.
    first_letter = re.search(r'[A-Z]', text)
    if first_letter is None:
        return 'UNKNOWN'

    initial = first_letter.group(0)
    if initial in 'FWG':
        return 'FEMALE'
    if initial in 'MB':
        return 'MALE'
    return 'UNKNOWN'
a/src/epipipeline/standardise/dengue/functions/fix_two_dates.py b/src/epipipeline/standardise/dengue/functions/FixTwoDates.py similarity index 94% rename from src/epipipeline/standardise/dengue/functions/fix_two_dates.py rename to src/epipipeline/standardise/dengue/functions/FixTwoDates.py index a45cc99..cf80f80 100644 --- a/src/epipipeline/standardise/dengue/functions/fix_two_dates.py +++ b/src/epipipeline/standardise/dengue/functions/FixTwoDates.py @@ -1,7 +1,7 @@ import pandas as pd import datetime -def date_fix(df: pd.Series) -> pd.Series: +def FixTwoDates(resultDate: datetime, sampleDate:datetime) -> pd.Series: """_summary_: Attempts to fix logical inconsistency in dates - Sample date > Result date Args: @@ -11,8 +11,8 @@ def date_fix(df: pd.Series) -> pd.Series: Returns: _type_: If logical errors can be fixed, returns updated date(s). Else, returns original dates. """ - resultDate=df[resultDate] - sampleDate=df[sampleDate] + + isinstance(resultDate, datetime) and isinstance(sampleDate, datetime), "Format the dates before applying this function" delta=resultDate-sampleDate if (pd.Timedelta(60, "d") < delta ) | (delta < pd.Timedelta(0, "d")): diff --git a/src/epipipeline/standardise/dengue/functions/assign_uuid.py b/src/epipipeline/standardise/dengue/functions/assign_uuid.py deleted file mode 100644 index a76b2ff..0000000 --- a/src/epipipeline/standardise/dengue/functions/assign_uuid.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import uuid - -def assign_uuid(n: int): - """_summary_ - - Args: - n (int): length of dataset/number of uuuids to be generated - - Returns: - pd.Series: series of uuid4 of length n; named metadata.recordID - """ - assert isinstance(n, int), "Invalid Input: Enter an integer" - - return pd.Series([uuid.uuid4() for i in range(n)], name="metadata.recordID") - - - - diff --git a/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py b/src/epipipeline/standardise/dengue/functions/extract_contact.py similarity 
index 73% rename from src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py rename to src/epipipeline/standardise/dengue/functions/extract_contact.py index 38ba22a..a4338e9 100644 --- a/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py +++ b/src/epipipeline/standardise/dengue/functions/extract_contact.py @@ -1,14 +1,15 @@ import pandas as pd import re +import numpy as np -def extract_mobile_from_address(address: str): +def extract_contact(address: str) -> tuple: """This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field Args: address (str): Name & Address in KA, and Address in PMC, PCMC, pune Rural Returns: - pd.Series: DataFrame series of address & mobile number + tuple: DataFrame series of address & mobile number """ assert isinstance(address, str), "Invalid input" @@ -18,6 +19,6 @@ def extract_mobile_from_address(address: str): mobile_number=mobile_present.group(1) address=re.sub(r"9?1?\d{10}","", address) else: - mobile_number=pd.NA + mobile_number=np.nan - return pd.Series(address, mobile_number) + return (address, mobile_number) diff --git a/src/epipipeline/standardise/dengue/functions/geocode.py b/src/epipipeline/standardise/dengue/functions/geocode.py index 18398fa..21c66d9 100644 --- a/src/epipipeline/standardise/dengue/functions/geocode.py +++ b/src/epipipeline/standardise/dengue/functions/geocode.py @@ -2,6 +2,7 @@ from googlemaps import Client as GoogleMaps import googlemaps import gmaps +import numpy as np # 2 ways to retrieve API key - env file or local encryption # Method 1 - Run this if you have set the API key in a .env file @@ -31,7 +32,7 @@ # Geocoding function -def geocode(full_address: str, MyAPI: str) -> tuple: +def Geocode(full_address: str, MyAPI: str) -> tuple: """_summary_ Args: @@ -52,11 +53,4 @@ def geocode(full_address: str, MyAPI: str) -> tuple: raise Exception("geocoding failed") except Exception as e: print(f"Unable to 
import re
import numpy as np

# Keywords are matched case-insensitively. "no" and "d" must start a word so
# that "Unknown" is not read as negative and "Pending"/"Invalid" are not
# read as positive. The digit codes 0/1 must not be part of a longer number.
_NEGATIVE_RESULT = re.compile(
    r"-ve|neg|(?<![A-Za-z])no|(?<![\d.])0(?![\d.])", re.IGNORECASE
)
_POSITIVE_RESULT = re.compile(
    r"ns1|igm|(?<![A-Za-z])d|yes|\+ve|pos|(?<![\d.])1(?![\d.])", re.IGNORECASE
)


def _result_from_code(code):
    """Map a numeric result code to a label: 0 is NEGATIVE, 1 is POSITIVE."""
    if code == 0:
        return "NEGATIVE"
    if code == 1:
        return "POSITIVE"
    return np.nan


def standardise_result(x) -> str:
    """Standardise a raw test result to POSITIVE or NEGATIVE.

    Numeric values (including floats, NumPy numbers and numeric strings)
    are treated as codes: 0 is NEGATIVE, 1 is POSITIVE, anything else is
    NaN. Text is checked for negative keywords first (-ve, neg..., a word
    starting with "no", a standalone 0) and then for positive keywords
    (NS1, IgM, a word starting with "d", yes, +ve, pos..., a standalone 1).

    Args:
        x (str/int/float): Result in the raw dataset

    Returns:
        str: "NEGATIVE" or "POSITIVE"; NaN (a float) when the value cannot
        be classified
    """
    # bool is a subclass of int; "True"/"False" match no keyword, so they
    # are unclassified rather than being read as the codes 1/0.
    if isinstance(x, bool):
        return np.nan

    if isinstance(x, (int, float, np.integer, np.floating)):
        return _result_from_code(x)

    if not isinstance(x, str):
        return np.nan

    text = x.strip()
    try:
        # Purely numeric text ("0", "1.0", "10") is a code, not a keyword.
        return _result_from_code(float(text))
    except ValueError:
        pass

    # Negative is tested first so that e.g. "NS1 Negative" is not classified
    # as positive by the "NS1" keyword.
    if _NEGATIVE_RESULT.search(text):
        return "NEGATIVE"
    if _POSITIVE_RESULT.search(text):
        return "POSITIVE"
    return np.nan