From 069b9989bfeefcd85852b02271f69f06a1746f4a Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 6 Apr 2024 01:01:26 +0530 Subject: [PATCH 1/9] added functions for preprocessing and standardisation --- .../preprocess/dengue/functions/MapColumns.py | 15 +++++++++++++ .../{set_dataframe.py => SetHeaders.py} | 6 +++--- ...split_NVBDCP.py => SplitWorkbookNVBDCP.py} | 2 +- .../{assign_uuid.py => AssignUUID.py} | 2 +- ...address.py => ExtractMobileFromAddress.py} | 2 +- .../{fix_two_dates.py => FixTwoDates.py} | 6 +++--- .../standardise/dengue/functions/NumvarStd.py | 21 +++++++++++++++++++ .../standardise/dengue/functions/geocode.py | 12 +++-------- 8 files changed, 48 insertions(+), 18 deletions(-) create mode 100644 src/epipipeline/preprocess/dengue/functions/MapColumns.py rename src/epipipeline/preprocess/dengue/functions/{set_dataframe.py => SetHeaders.py} (90%) rename src/epipipeline/preprocess/dengue/functions/{workbook_split_NVBDCP.py => SplitWorkbookNVBDCP.py} (97%) rename src/epipipeline/standardise/dengue/functions/{assign_uuid.py => AssignUUID.py} (93%) rename src/epipipeline/standardise/dengue/functions/{extract_mobile_from_address.py => ExtractMobileFromAddress.py} (93%) rename src/epipipeline/standardise/dengue/functions/{fix_two_dates.py => FixTwoDates.py} (94%) create mode 100644 src/epipipeline/standardise/dengue/functions/NumvarStd.py diff --git a/src/epipipeline/preprocess/dengue/functions/MapColumns.py b/src/epipipeline/preprocess/dengue/functions/MapColumns.py new file mode 100644 index 0000000..b4f0d8d --- /dev/null +++ b/src/epipipeline/preprocess/dengue/functions/MapColumns.py @@ -0,0 +1,15 @@ +def MapColumns(colname:str, map_dict: dict) -> str: + """_summary_ + + Args: + colname (str): Current column in DataFrame + map (dict): Dictionary mapping of preprocessed col names to standardised col names + + Returns: + str: Standardised column name + """ + + for key, values in map_dict.items(): + if colname in values: + return key + return 
colname \ No newline at end of file diff --git a/src/epipipeline/preprocess/dengue/functions/set_dataframe.py b/src/epipipeline/preprocess/dengue/functions/SetHeaders.py similarity index 90% rename from src/epipipeline/preprocess/dengue/functions/set_dataframe.py rename to src/epipipeline/preprocess/dengue/functions/SetHeaders.py index 848eb02..cadf74e 100644 --- a/src/epipipeline/preprocess/dengue/functions/set_dataframe.py +++ b/src/epipipeline/preprocess/dengue/functions/SetHeaders.py @@ -2,7 +2,7 @@ import re -def search_header(L: list, pivot_col_name) -> bool: +def SearchHeader(L: list, pivot_col_name) -> bool: """This function identifies the header row in a dataframe Args: @@ -22,7 +22,7 @@ def search_header(L: list, pivot_col_name) -> bool: break return header_search -def set_headers(df: pd.DataFrame, pivot_column: str, col_start_index: int, col_start_value): +def SetHeaders(df: pd.DataFrame, pivot_column: str, col_start_index: int, col_start_value): """_summary_ Args: @@ -35,7 +35,7 @@ def set_headers(df: pd.DataFrame, pivot_column: str, col_start_index: int, col_s # sets the dataframe's row start i=0 - while search_header(list(df.columns), pivot_column) and (i<6): # change pivot column here, if needed + while SearchHeader(list(df.columns), pivot_column) and (i<6): # change pivot column here, if needed df.columns=df.iloc[i,:] i+=1 df.drop(axis=0, index=[n for n in range(i)], inplace=True) diff --git a/src/epipipeline/preprocess/dengue/functions/workbook_split_NVBDCP.py b/src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py similarity index 97% rename from src/epipipeline/preprocess/dengue/functions/workbook_split_NVBDCP.py rename to src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py index 7efa475..95bd2f4 100644 --- a/src/epipipeline/preprocess/dengue/functions/workbook_split_NVBDCP.py +++ b/src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py @@ -3,7 +3,7 @@ import re import datetime -def 
split_workbook(workbook_name:str) -> bool: +def SplitWorkbookNVBDCP(workbook_name:str) -> bool: """This function splits the NVBDCP Raw Line list into individual csvs by source. Args: diff --git a/src/epipipeline/standardise/dengue/functions/assign_uuid.py b/src/epipipeline/standardise/dengue/functions/AssignUUID.py similarity index 93% rename from src/epipipeline/standardise/dengue/functions/assign_uuid.py rename to src/epipipeline/standardise/dengue/functions/AssignUUID.py index a76b2ff..381ac8a 100644 --- a/src/epipipeline/standardise/dengue/functions/assign_uuid.py +++ b/src/epipipeline/standardise/dengue/functions/AssignUUID.py @@ -1,7 +1,7 @@ import pandas as pd import uuid -def assign_uuid(n: int): +def AssignUUID(n: int): """_summary_ Args: diff --git a/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py similarity index 93% rename from src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py rename to src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py index 38ba22a..6ac652a 100644 --- a/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py +++ b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py @@ -1,7 +1,7 @@ import pandas as pd import re -def extract_mobile_from_address(address: str): +def ExtractMobileFromAddress(address: str): """This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field Args: diff --git a/src/epipipeline/standardise/dengue/functions/fix_two_dates.py b/src/epipipeline/standardise/dengue/functions/FixTwoDates.py similarity index 94% rename from src/epipipeline/standardise/dengue/functions/fix_two_dates.py rename to src/epipipeline/standardise/dengue/functions/FixTwoDates.py index a45cc99..cf80f80 100644 --- a/src/epipipeline/standardise/dengue/functions/fix_two_dates.py +++ 
b/src/epipipeline/standardise/dengue/functions/FixTwoDates.py @@ -1,7 +1,7 @@ import pandas as pd import datetime -def date_fix(df: pd.Series) -> pd.Series: +def FixTwoDates(resultDate: datetime, sampleDate:datetime) -> pd.Series: """_summary_: Attempts to fix logical inconsistency in dates - Sample date > Result date Args: @@ -11,8 +11,8 @@ def date_fix(df: pd.Series) -> pd.Series: Returns: _type_: If logical errors can be fixed, returns updated date(s). Else, returns original dates. """ - resultDate=df[resultDate] - sampleDate=df[sampleDate] + + isinstance(resultDate, datetime) and isinstance(sampleDate, datetime), "Format the dates before applying this function" delta=resultDate-sampleDate if (pd.Timedelta(60, "d") < delta ) | (delta < pd.Timedelta(0, "d")): diff --git a/src/epipipeline/standardise/dengue/functions/NumvarStd.py b/src/epipipeline/standardise/dengue/functions/NumvarStd.py new file mode 100644 index 0000000..db4eb15 --- /dev/null +++ b/src/epipipeline/standardise/dengue/functions/NumvarStd.py @@ -0,0 +1,21 @@ +# standardise dtypes +import re +import numpy as np + +def NumvarStd(x): + """_summary_ + + Args: + x (_type_): string/object variable + + Returns: + _type_: numbers + """ + if re.search(r"[^\d]", str(x)): + res=re.search(r"\d+", str(x)) + if res: + return res.group(0) + else: + return np.nan + else: + return x \ No newline at end of file diff --git a/src/epipipeline/standardise/dengue/functions/geocode.py b/src/epipipeline/standardise/dengue/functions/geocode.py index 18398fa..21c66d9 100644 --- a/src/epipipeline/standardise/dengue/functions/geocode.py +++ b/src/epipipeline/standardise/dengue/functions/geocode.py @@ -2,6 +2,7 @@ from googlemaps import Client as GoogleMaps import googlemaps import gmaps +import numpy as np # 2 ways to retrieve API key - env file or local encryption # Method 1 - Run this if you have set the API key in a .env file @@ -31,7 +32,7 @@ # Geocoding function -def geocode(full_address: str, MyAPI: str) -> tuple: +def 
Geocode(full_address: str, MyAPI: str) -> tuple: """_summary_ Args: @@ -52,11 +53,4 @@ def geocode(full_address: str, MyAPI: str) -> tuple: raise Exception("geocoding failed") except Exception as e: print(f"Unable to geocode {full_address}") - return None,None - -# run function - -# Apply function -# df["location.geometry.latitude.imputed"], df["location.geometry.longitude.imputed"] = zip(df["full_address"].apply(lambda x: geocode(x, MyAPI))) - - + return np.nan,np.nan From c5b5b7236d059d9dc27768b12bc0ca478ebf79e0 Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 13:11:54 +0530 Subject: [PATCH 2/9] MINOR: standardise_age2 --- src/epipipeline/standardise/demographics.py | 35 +++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/epipipeline/standardise/demographics.py b/src/epipipeline/standardise/demographics.py index 0ea70eb..12a3649 100644 --- a/src/epipipeline/standardise/demographics.py +++ b/src/epipipeline/standardise/demographics.py @@ -28,6 +28,41 @@ def standardise_age(age): else: return np.nan +def standardise_age2(age): + if isinstance(age, str): + pattern = r'^(\d+\.?\d*) *([ym]?[ |.|,|-]?.*)?$' + match = re.search(pattern, age) + if match: + if match.group(1): + if re.match(r'^\d{1,3}', match.group(1)): + age = float(match.group(1)) + else: + return np.nan + else: + return np.nan + if match.group(2): + if re.match('^[m|M].*', match.group(2)): + if age<13: + return round(age / 12, 2) + else: + return age + elif re.match(r'^[y|Y]\D*\d{1,2}[m|M]', match.group(2)): + month_match=re.match(r'^[y|Y]\D*(\d{1,2})[m|M]', match.group(2)) + if month_match.group(1): + month=round(float(month_match.group(1))/ 12, 2) + age+=month + return age + else: + return age + return age + else: + return np.nan + elif isinstance(age, int): + return float(age) + elif isinstance(age,float): + return age + else: + return np.nan def standardise_gender(gender): standard_genders = ['MALE', 'FEMALE'] From 
12036b3bb0fbc70d91571f7738b1c7d75822e69b Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 15:33:04 +0530 Subject: [PATCH 3/9] MINOR: standardise_gender2 --- src/epipipeline/standardise/demographics.py | 31 ++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/src/epipipeline/standardise/demographics.py b/src/epipipeline/standardise/demographics.py index 12a3649..1ecf855 100644 --- a/src/epipipeline/standardise/demographics.py +++ b/src/epipipeline/standardise/demographics.py @@ -28,7 +28,15 @@ def standardise_age(age): else: return np.nan -def standardise_age2(age): +def standardise_age2(age) -> float: + """Converts mixed age entries to a standard float + + Args: + age (str/float/int): age specified in the raw dataset + + Returns: + float: standardised age + """ if isinstance(age, str): pattern = r'^(\d+\.?\d*) *([ym]?[ |.|,|-]?.*)?$' match = re.search(pattern, age) @@ -77,3 +85,24 @@ def standardise_gender(gender): best_match = max(matches, key=lambda x: x[1]) return best_match[0] + + +def standardise_gender2(gender:str)->str: + """Converts mixed gender entries to a standard format + + Args: + gender (str): gender entries in the raw dataset + + Returns: + str: standardised gender (FEMALE, MALE, UNKNOWN) + """ + gender = str(gender).upper().lstrip().rstrip() + + if re.search(r'[fwgFWG]', gender): + gender="FEMALE" + elif re.search(r'^[mbMB]', gender): + gender='MALE' + else: + return 'UNKNOWN' + + return gender \ No newline at end of file From 3bca07ccfd5e3f970447a44ad0b4f0e833021eec Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:00:31 +0530 Subject: [PATCH 4/9] PATCH: edited function to return tuple --- .../dengue/functions/ExtractMobileFromAddress.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py index 
6ac652a..454c956 100644 --- a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py +++ b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py @@ -1,14 +1,14 @@ import pandas as pd import re -def ExtractMobileFromAddress(address: str): +def extract_mobile_from_address(address: str) -> tuple: """This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field Args: address (str): Name & Address in KA, and Address in PMC, PCMC, pune Rural Returns: - pd.Series: DataFrame series of address & mobile number + tuple: DataFrame series of address & mobile number """ assert isinstance(address, str), "Invalid input" @@ -20,4 +20,4 @@ def ExtractMobileFromAddress(address: str): else: mobile_number=pd.NA - return pd.Series(address, mobile_number) + return (address, mobile_number) From 84b146f6a1665c81851d3b23ea51de3b696b54b0 Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:01:48 +0530 Subject: [PATCH 5/9] PATCH: edited null value return for extract mobile number function --- .../standardise/dengue/functions/ExtractMobileFromAddress.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py index 454c956..281bfb7 100644 --- a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py +++ b/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py @@ -1,5 +1,6 @@ import pandas as pd import re +import numpy as np def extract_mobile_from_address(address: str) -> tuple: """This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field @@ -18,6 +19,6 @@ def extract_mobile_from_address(address: str) -> tuple: mobile_number=mobile_present.group(1) address=re.sub(r"9?1?\d{10}","", address) else: - mobile_number=pd.NA + 
mobile_number=np.nan return (address, mobile_number) From 36f6717ec778b5b03a989b49e5ad36747f2192c1 Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:06:11 +0530 Subject: [PATCH 6/9] PATCH: added string vars to map_columns function --- .../preprocess/dengue/functions/MapColumns.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/epipipeline/preprocess/dengue/functions/MapColumns.py b/src/epipipeline/preprocess/dengue/functions/MapColumns.py index b4f0d8d..48fbae9 100644 --- a/src/epipipeline/preprocess/dengue/functions/MapColumns.py +++ b/src/epipipeline/preprocess/dengue/functions/MapColumns.py @@ -1,5 +1,7 @@ -def MapColumns(colname:str, map_dict: dict) -> str: - """_summary_ +import re + +def map_columns(colname:str, map_dict: dict) -> str: + """_summary_Standardises column names using mapping in config file Args: colname (str): Current column in DataFrame @@ -9,7 +11,11 @@ def MapColumns(colname:str, map_dict: dict) -> str: str: Standardised column name """ + colname=re.sub(r"[^\w\s]","", colname.lstrip().rstrip().lower()) + colname=re.sub(r"(\s+)"," ", colname) + colname=re.sub(r"\s","_", colname) + for key, values in map_dict.items(): if colname in values: return key - return colname \ No newline at end of file + return colname From c016a6003ca77ac37b91b7f144d9d049865c7551 Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:16:12 +0530 Subject: [PATCH 7/9] PATCH: updated function names to snake case --- .../{MapColumns.py => map_columns.py} | 2 +- .../{SetHeaders.py => set_headers.py} | 6 +++--- ...bookNVBDCP.py => split_workbook_NVBDCP.py} | 2 +- .../dengue/functions/AssignUUID.py | 19 ----------------- .../standardise/dengue/functions/NumvarStd.py | 21 ------------------- ...ress.py => extract_mobile_from_address.py} | 2 +- 6 files changed, 6 insertions(+), 46 deletions(-) rename src/epipipeline/preprocess/dengue/functions/{MapColumns.py => map_columns.py} (87%) rename
src/epipipeline/preprocess/dengue/functions/{SetHeaders.py => set_headers.py} (90%) rename src/epipipeline/preprocess/dengue/functions/{SplitWorkbookNVBDCP.py => split_workbook_NVBDCP.py} (97%) delete mode 100644 src/epipipeline/standardise/dengue/functions/AssignUUID.py delete mode 100644 src/epipipeline/standardise/dengue/functions/NumvarStd.py rename src/epipipeline/standardise/dengue/functions/{ExtractMobileFromAddress.py => extract_mobile_from_address.py} (92%) diff --git a/src/epipipeline/preprocess/dengue/functions/MapColumns.py b/src/epipipeline/preprocess/dengue/functions/map_columns.py similarity index 87% rename from src/epipipeline/preprocess/dengue/functions/MapColumns.py rename to src/epipipeline/preprocess/dengue/functions/map_columns.py index 48fbae9..37080fb 100644 --- a/src/epipipeline/preprocess/dengue/functions/MapColumns.py +++ b/src/epipipeline/preprocess/dengue/functions/map_columns.py @@ -1,7 +1,7 @@ import re def map_columns(colname:str, map_dict: dict) -> str: - """_summary_Standardises column names using mapping in config file + """This function standardises column names using mapping in config file Args: colname (str): Current column in DataFrame diff --git a/src/epipipeline/preprocess/dengue/functions/SetHeaders.py b/src/epipipeline/preprocess/dengue/functions/set_headers.py similarity index 90% rename from src/epipipeline/preprocess/dengue/functions/SetHeaders.py rename to src/epipipeline/preprocess/dengue/functions/set_headers.py index cadf74e..848eb02 100644 --- a/src/epipipeline/preprocess/dengue/functions/SetHeaders.py +++ b/src/epipipeline/preprocess/dengue/functions/set_headers.py @@ -2,7 +2,7 @@ import re -def SearchHeader(L: list, pivot_col_name) -> bool: +def search_header(L: list, pivot_col_name) -> bool: """This function identifies the header row in a dataframe Args: @@ -22,7 +22,7 @@ def SearchHeader(L: list, pivot_col_name) -> bool: break return header_search -def SetHeaders(df: pd.DataFrame, pivot_column: str, 
col_start_index: int, col_start_value): +def set_headers(df: pd.DataFrame, pivot_column: str, col_start_index: int, col_start_value): """_summary_ Args: @@ -35,7 +35,7 @@ def SetHeaders(df: pd.DataFrame, pivot_column: str, col_start_index: int, col_st # sets the dataframe's row start i=0 - while SearchHeader(list(df.columns), pivot_column) and (i<6): # change pivot column here, if needed + while search_header(list(df.columns), pivot_column) and (i<6): # change pivot column here, if needed df.columns=df.iloc[i,:] i+=1 df.drop(axis=0, index=[n for n in range(i)], inplace=True) diff --git a/src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py b/src/epipipeline/preprocess/dengue/functions/split_workbook_NVBDCP.py similarity index 97% rename from src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py rename to src/epipipeline/preprocess/dengue/functions/split_workbook_NVBDCP.py index 95bd2f4..4b94849 100644 --- a/src/epipipeline/preprocess/dengue/functions/SplitWorkbookNVBDCP.py +++ b/src/epipipeline/preprocess/dengue/functions/split_workbook_NVBDCP.py @@ -3,7 +3,7 @@ import re import datetime -def SplitWorkbookNVBDCP(workbook_name:str) -> bool: +def split_workbook_NVBDCP(workbook_name:str) -> bool: """This function splits the NVBDCP Raw Line list into individual csvs by source. 
Args: diff --git a/src/epipipeline/standardise/dengue/functions/AssignUUID.py b/src/epipipeline/standardise/dengue/functions/AssignUUID.py deleted file mode 100644 index 381ac8a..0000000 --- a/src/epipipeline/standardise/dengue/functions/AssignUUID.py +++ /dev/null @@ -1,19 +0,0 @@ -import pandas as pd -import uuid - -def AssignUUID(n: int): - """_summary_ - - Args: - n (int): length of dataset/number of uuuids to be generated - - Returns: - pd.Series: series of uuid4 of length n; named metadata.recordID - """ - assert isinstance(n, int), "Invalid Input: Enter an integer" - - return pd.Series([uuid.uuid4() for i in range(n)], name="metadata.recordID") - - - - diff --git a/src/epipipeline/standardise/dengue/functions/NumvarStd.py b/src/epipipeline/standardise/dengue/functions/NumvarStd.py deleted file mode 100644 index db4eb15..0000000 --- a/src/epipipeline/standardise/dengue/functions/NumvarStd.py +++ /dev/null @@ -1,21 +0,0 @@ -# standardise dtypes -import re -import numpy as np - -def NumvarStd(x): - """_summary_ - - Args: - x (_type_): string/object variable - - Returns: - _type_: numbers - """ - if re.search(r"[^\d]", str(x)): - res=re.search(r"\d+", str(x)) - if res: - return res.group(0) - else: - return np.nan - else: - return x \ No newline at end of file diff --git a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py b/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py similarity index 92% rename from src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py rename to src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py index 281bfb7..a4338e9 100644 --- a/src/epipipeline/standardise/dengue/functions/ExtractMobileFromAddress.py +++ b/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py @@ -2,7 +2,7 @@ import re import numpy as np -def extract_mobile_from_address(address: str) -> tuple: +def extract_contact(address: str) -> tuple: """This function 
extracts mobile number from the address/name fields and strips the name/address from the mobile number field Args: From e18fe81f27b095c932d6e3edd1431c0efd2dc68a Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:17:47 +0530 Subject: [PATCH 8/9] PATCH: renamed function --- .../{extract_mobile_from_address.py => extract_contact.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/epipipeline/standardise/dengue/functions/{extract_mobile_from_address.py => extract_contact.py} (100%) diff --git a/src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py b/src/epipipeline/standardise/dengue/functions/extract_contact.py similarity index 100% rename from src/epipipeline/standardise/dengue/functions/extract_mobile_from_address.py rename to src/epipipeline/standardise/dengue/functions/extract_contact.py From 7fdd3303dc9ce46fc3141ee1ebdc1d1bc1174fcc Mon Sep 17 00:00:00 2001 From: anujavenkatachalam04 Date: Sat, 13 Apr 2024 19:20:58 +0530 Subject: [PATCH 9/9] MINOR: added standardise_result function --- .../dengue/functions/standardise_result.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/epipipeline/standardise/dengue/functions/standardise_result.py diff --git a/src/epipipeline/standardise/dengue/functions/standardise_result.py b/src/epipipeline/standardise/dengue/functions/standardise_result.py new file mode 100644 index 0000000..0b9ce05 --- /dev/null +++ b/src/epipipeline/standardise/dengue/functions/standardise_result.py @@ -0,0 +1,19 @@ +import re +import numpy as np + +def standardise_result(x) -> str: + """This function standardises results to positive or negative + + Args: + x (str/int): Result in the raw dataset + + Returns: + str: Negative, Positive or NaN + """ + if isinstance(x, str) or isinstance(x, int): + if re.search(r"-ve|Neg|Negative|No|0", str(x), re.IGNORECASE): + return "NEGATIVE" + elif re.search(r"NS1|IgM|D|Yes|\+ve|Pos|Positive|1", str(x), re.IGNORECASE): + 
return "POSITIVE" + return np.nan +