Merge pull request #19 from dsih-artpark/anuja-working

added functions for preprocessing and standardisation
dsih-artpark · Apr 18, 2024 · 00c5705 · 00c5705
2 parents 5b7c845 + 7fdd330
commit 00c5705
Show file tree

Hide file tree

Showing 9 changed files with 116 additions and 36 deletions.
diff --git a/src/epipipeline/preprocess/dengue/functions/map_columns.py b/src/epipipeline/preprocess/dengue/functions/map_columns.py
@@ -0,0 +1,21 @@
+import re
+
def map_columns(colname: str, map_dict: dict) -> str:
    """Standardise a column name using the mapping from the config file.

    The name is trimmed, lower-cased, stripped of punctuation, and every run
    of whitespace is replaced by a single underscore. The cleaned name is
    then looked up among the aliases listed in ``map_dict``.

    Args:
        colname (str): Current column name in the DataFrame. Non-string
            labels (e.g. integer headers) are converted with ``str`` first.
        map_dict (dict): Maps each standardised column name to its accepted
            preprocessed aliases, given either as a collection of strings
            or as a single string.

    Returns:
        str: The first standardised name whose aliases contain the cleaned
        name; otherwise the cleaned name itself.
    """
    # str() keeps non-string headers (ints, NaN) from raising AttributeError.
    cleaned = re.sub(r"[^\w\s]", "", str(colname).strip().lower())
    cleaned = re.sub(r"\s+", "_", cleaned)

    for standard_name, aliases in map_dict.items():
        # A bare string alias must be compared as a whole: `in` on a str is
        # a substring test, which would map "age" onto "patient_age".
        if isinstance(aliases, str):
            aliases = (aliases,)
        if cleaned in aliases:
            return standard_name
    return cleaned
diff --git a/...process/dengue/functions/set_dataframe.py → ...reprocess/dengue/functions/set_headers.py b/...process/dengue/functions/set_dataframe.py → ...reprocess/dengue/functions/set_headers.py
diff --git a/...dengue/functions/workbook_split_NVBDCP.py → ...dengue/functions/split_workbook_NVBDCP.py b/...dengue/functions/workbook_split_NVBDCP.py → ...dengue/functions/split_workbook_NVBDCP.py
@@ -3,7 +3,7 @@
 import re
 import datetime
 
-def split_workbook(workbook_name:str) -> bool:
+def split_workbook_NVBDCP(workbook_name:str) -> bool:
     """This function splits the NVBDCP Raw Line list into individual csvs by source.
 
     Args:

diff --git a/src/epipipeline/standardise/demographics.py b/src/epipipeline/standardise/demographics.py
@@ -28,6 +28,49 @@ def standardise_age(age):
     else:
         return np.nan
 
def standardise_age2(age) -> float:
    """Convert a mixed-format age entry to a float number of years.

    String entries must start with a number of at most three integer digits
    (an optional decimal part is allowed), optionally followed by a unit:

    * ``m...`` / ``M...``: values below 13 are read as months and converted
      to years; values of 13 or more are returned unchanged as years.
    * ``y...<digits>m``: years plus months, e.g. ``"5y 6m"`` -> ``5.5``.
    * anything else: the leading number is taken as years.

    Args:
        age (str/float/int): age specified in the raw dataset

    Returns:
        float: standardised age in years, or NaN when the entry cannot be
        interpreted.
    """
    if isinstance(age, str):
        # (?!\d) makes the three-digit limit effective, so date-like or
        # otherwise implausible entries ("2024-01-01") are rejected.
        match = re.match(r'^(\d{1,3}(?:\.\d*)?)(?!\d)\s*(.*)$', age.strip())
        if not match:
            return np.nan

        years = float(match.group(1))
        suffix = match.group(2)

        if re.match(r'^[mM]', suffix):
            # Small values are months; larger ones are left as years.
            return round(years / 12, 2) if years < 13 else years

        month_match = re.match(r'^[yY]\D*(\d{1,2})[mM]', suffix)
        if month_match:
            return years + round(float(month_match.group(1)) / 12, 2)

        return years

    # NumPy integer scalars are not instances of the built-in int.
    if isinstance(age, (int, np.integer)):
        return float(age)
    if isinstance(age, float):
        return age
    return np.nan
 
 def standardise_gender(gender):
     standard_genders = ['MALE', 'FEMALE']
@@ -42,3 +85,24 @@ def standardise_gender(gender):
     best_match = max(matches, key=lambda x: x[1])
 
     return best_match[0]
+
+
def standardise_gender2(gender: str) -> str:
    """Convert a mixed gender entry to a standard format.

    The entry is converted to a string, trimmed and upper-cased, then
    classified by its first character only: F, W or G -> FEMALE; M or B ->
    MALE. Matching on the first character (rather than anywhere in the
    text) keeps entries such as "UNKNOWN" or "NOT GIVEN" from being read
    as FEMALE.

    Args:
        gender (str): gender entry in the raw dataset; non-string values
            are converted with ``str``.

    Returns:
        str: standardised gender (FEMALE, MALE, UNKNOWN)
    """
    label = str(gender).strip().upper()

    if re.match(r'[FWG]', label):
        return 'FEMALE'
    if re.match(r'[MB]', label):
        return 'MALE'
    return 'UNKNOWN'
diff --git a/...dardise/dengue/functions/fix_two_dates.py → ...andardise/dengue/functions/FixTwoDates.py b/...dardise/dengue/functions/fix_two_dates.py → ...andardise/dengue/functions/FixTwoDates.py
@@ -1,7 +1,7 @@
 import pandas as pd
 import datetime
 
-def date_fix(df: pd.Series) -> pd.Series:
+def FixTwoDates(resultDate: datetime, sampleDate:datetime) -> pd.Series:
     """_summary_: Attempts to fix logical inconsistency in dates - Sample date > Result date
 
     Args:
@@ -11,8 +11,8 @@ def date_fix(df: pd.Series) -> pd.Series:
     Returns:
         _type_: If logical errors can be fixed, returns updated date(s). Else, returns original dates.
     """
-    resultDate=df[resultDate]
-    sampleDate=df[sampleDate]
+
+    isinstance(resultDate, datetime) and isinstance(sampleDate, datetime), "Format the dates before applying this function"
     delta=resultDate-sampleDate
 
     if (pd.Timedelta(60, "d") < delta ) | (delta < pd.Timedelta(0, "d")):

diff --git a/src/epipipeline/standardise/dengue/functions/assign_uuid.py b/src/epipipeline/standardise/dengue/functions/assign_uuid.py
diff --git a/.../functions/extract_mobile_from_address.py → ...rdise/dengue/functions/extract_contact.py b/.../functions/extract_mobile_from_address.py → ...rdise/dengue/functions/extract_contact.py
@@ -1,14 +1,15 @@
 import pandas as pd
 import re
+import numpy as np
 
-def extract_mobile_from_address(address: str):
+def extract_contact(address: str) -> tuple:
     """This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field
 
     Args:
         address (str): Name & Address in KA, and Address in PMC, PCMC, pune Rural
 
     Returns:
-        pd.Series: DataFrame series of address & mobile number
+        tuple: DataFrame series of address & mobile number
     """
     assert isinstance(address, str), "Invalid input"
 
@@ -18,6 +19,6 @@ def extract_mobile_from_address(address: str):
         mobile_number=mobile_present.group(1)
         address=re.sub(r"9?1?\d{10}","", address)
     else:
-        mobile_number=pd.NA
+        mobile_number=np.nan
 
-    return pd.Series(address, mobile_number)
+    return (address, mobile_number)
diff --git a/src/epipipeline/standardise/dengue/functions/geocode.py b/src/epipipeline/standardise/dengue/functions/geocode.py
@@ -2,6 +2,7 @@
 from googlemaps import Client as GoogleMaps
 import googlemaps
 import gmaps
+import numpy as np
 
 # 2 ways to retrieve API key - env file or local encryption
 # Method 1 - Run this if you have set the API key in a .env file
@@ -31,7 +32,7 @@
 
 # Geocoding function
 
-def geocode(full_address: str, MyAPI: str) -> tuple:
+def Geocode(full_address: str, MyAPI: str) -> tuple:
     """_summary_
 
     Args:
@@ -52,11 +53,4 @@ def geocode(full_address: str, MyAPI: str) -> tuple:
             raise Exception("geocoding failed")
     except Exception as e:
         print(f"Unable to geocode {full_address}")
-    return None,None
-
-# run function
-
-# Apply function
-# df["location.geometry.latitude.imputed"], df["location.geometry.longitude.imputed"] = zip(df["full_address"].apply(lambda x: geocode(x, MyAPI)))
-
-
+    return np.nan,np.nan
diff --git a/src/epipipeline/standardise/dengue/functions/standardise_result.py b/src/epipipeline/standardise/dengue/functions/standardise_result.py
@@ -0,0 +1,19 @@
+import re
+import numpy as np
+
def standardise_result(x) -> str:
    """Standardise a raw test result to "POSITIVE" or "NEGATIVE".

    Negative markers are checked before positive ones, so an entry such as
    "NS1 Negative" is NEGATIVE. The short tokens "no", "d", "0" and "1" are
    matched only at a word boundary so that they do not fire inside
    unrelated words or numbers (e.g. "Unknown", "Pending", 10).

    Args:
        x (str/int/float): Result in the raw dataset. Whole-number floats
            (e.g. 1.0 from a numeric column containing gaps) are treated
            like the matching integer.

    Returns:
        str: "NEGATIVE", "POSITIVE", or NaN when the entry is missing or
        not recognised.
    """
    if isinstance(x, float):
        # Covers NaN/inf and fractional values, none of which encode a result.
        if x != x or not x.is_integer():
            return np.nan
        x = int(x)

    if isinstance(x, (str, int, np.integer)):
        text = str(x)
        if re.search(r"-ve|neg|\bno|\b0\b", text, re.IGNORECASE):
            return "NEGATIVE"
        if re.search(r"ns1|igm|\bd|yes|\+ve|pos|\b1\b", text, re.IGNORECASE):
            return "POSITIVE"
    return np.nan
+