Skip to content

Commit

Permalink
Merge pull request #19 from dsih-artpark/anuja-working
Browse files Browse the repository at this point in the history
added functions for preprocessing and standardisation
  • Loading branch information
anujavenkatachalam04 authored Apr 18, 2024
2 parents 5b7c845 + 7fdd330 commit 00c5705
Show file tree
Hide file tree
Showing 9 changed files with 116 additions and 36 deletions.
21 changes: 21 additions & 0 deletions src/epipipeline/preprocess/dengue/functions/map_columns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import re

def map_columns(colname: str, map_dict: dict) -> str:
    """Standardise a column name using the mapping from the config file.

    The raw name is normalised first (trimmed, lower-cased, punctuation
    removed, runs of whitespace replaced by a single underscore) and then
    looked up among the candidate names of every entry in ``map_dict``.

    Args:
        colname (str): Current column name in the DataFrame.
        map_dict (dict): Maps each standardised column name to the
            preprocessed name(s) that should be renamed to it. A value may be
            a collection of names or a single name given as a plain string.

    Returns:
        str: The standardised column name if a mapping entry lists the
        normalised name, otherwise the normalised name itself.
    """
    colname = re.sub(r"[^\w\s]", "", colname.lstrip().rstrip().lower())
    colname = re.sub(r"(\s+)", " ", colname)
    colname = re.sub(r"\s", "_", colname)

    for key, values in map_dict.items():
        if values is None:
            # An empty mapping entry has no candidates to compare against.
            continue
        # A bare string must be compared as one whole name: `in` on a str is
        # a substring test, so "age" would otherwise match "patient_age".
        candidates = (values,) if isinstance(values, str) else values
        if colname in candidates:
            return key
    return colname
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
import datetime

def split_workbook(workbook_name:str) -> bool:
def split_workbook_NVBDCP(workbook_name:str) -> bool:
"""This function splits the NVBDCP Raw Line list into individual csvs by source.
Args:
Expand Down
64 changes: 64 additions & 0 deletions src/epipipeline/standardise/demographics.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,49 @@ def standardise_age(age):
else:
return np.nan

def standardise_age2(age) -> float:
    """Convert a mixed-format age entry to a float number of years.

    String entries must start with a number whose integer part has at most
    three digits, optionally followed by a unit suffix:

    * suffix starting with "m" (months): values below 13 are converted to
      years (rounded to 2 decimals); larger values are returned unchanged.
    * suffix starting with "y" and followed by a month count such as
      "2y 6m" or "2 years 6 months": the months (rounded to 2 decimals of a
      year) are added to the years.
    * any other suffix, or none: the leading number is returned as years.

    Args:
        age (str/float/int): Age specified in the raw dataset.

    Returns:
        float: Standardised age, or NaN when the entry cannot be parsed or
        is of an unsupported type.
    """
    if isinstance(age, str):
        # Strip first so that padded cells such as " 5 " are still parsed.
        match = re.search(r'^(\d+\.?\d*)\s*(.*)$', age.strip())
        if not match:
            return np.nan

        number, suffix = match.group(1), match.group(2)
        # Reject numbers with more than three integer digits (e.g. a phone
        # number typed into the age column).
        if not re.fullmatch(r'\d{1,3}(?:\.\d*)?', number):
            return np.nan
        years = float(number)

        if not suffix:
            return years

        if re.match(r'[mM]', suffix):
            # Months only: small values are converted to years, values of 13
            # or more are returned as they are.
            if years < 13:
                return round(years / 12, 2)
            return years

        # Years followed by months; whitespace between the month count and
        # its unit is allowed ("6 months" as well as "6m").
        month_match = re.match(r'[yY]\D*(\d{1,2})\s*[mM]', suffix)
        if month_match:
            return years + round(float(month_match.group(1)) / 12, 2)
        return years

    if isinstance(age, (int, np.integer)):
        return float(age)
    if isinstance(age, float):
        return age
    return np.nan

def standardise_gender(gender):
standard_genders = ['MALE', 'FEMALE']
Expand All @@ -42,3 +85,24 @@ def standardise_gender(gender):
best_match = max(matches, key=lambda x: x[1])

return best_match[0]


def standardise_gender2(gender: str) -> str:
    """Convert a mixed-format gender entry to a standard label.

    The entry is converted to text, trimmed and upper-cased, then classified
    by its first character: F, W or G gives FEMALE; M or B gives MALE.
    Anything else, including empty text, gives UNKNOWN.

    Args:
        gender (str): Gender entry in the raw dataset.

    Returns:
        str: Standardised gender (FEMALE, MALE, UNKNOWN).
    """
    gender = str(gender).upper().lstrip().rstrip()

    # Both checks are anchored at the start of the text. An unanchored search
    # would label any value merely containing F, W or G (e.g. "UNKNOWN",
    # "TRANSGENDER") as FEMALE.
    if re.match(r'[FWG]', gender):
        return 'FEMALE'
    if re.match(r'[MB]', gender):
        return 'MALE'
    return 'UNKNOWN'
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas as pd
import datetime

def date_fix(df: pd.Series) -> pd.Series:
def FixTwoDates(resultDate: datetime, sampleDate:datetime) -> pd.Series:
"""_summary_: Attempts to fix logical inconsistency in dates - Sample date > Result date
Args:
Expand All @@ -11,8 +11,8 @@ def date_fix(df: pd.Series) -> pd.Series:
Returns:
_type_: If logical errors can be fixed, returns updated date(s). Else, returns original dates.
"""
resultDate=df[resultDate]
sampleDate=df[sampleDate]

isinstance(resultDate, datetime) and isinstance(sampleDate, datetime), "Format the dates before applying this function"
delta=resultDate-sampleDate

if (pd.Timedelta(60, "d") < delta ) | (delta < pd.Timedelta(0, "d")):
Expand Down
19 changes: 0 additions & 19 deletions src/epipipeline/standardise/dengue/functions/assign_uuid.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
import pandas as pd
import re
import numpy as np

def extract_mobile_from_address(address: str):
def extract_contact(address: str) -> tuple:
"""This function extracts mobile number from the address/name fields and strips the name/address from the mobile number field
Args:
address (str): Name & Address in KA, and Address in PMC, PCMC, pune Rural
Returns:
pd.Series: DataFrame series of address & mobile number
tuple: DataFrame series of address & mobile number
"""
assert isinstance(address, str), "Invalid input"

Expand All @@ -18,6 +19,6 @@ def extract_mobile_from_address(address: str):
mobile_number=mobile_present.group(1)
address=re.sub(r"9?1?\d{10}","", address)
else:
mobile_number=pd.NA
mobile_number=np.nan

return pd.Series(address, mobile_number)
return (address, mobile_number)
12 changes: 3 additions & 9 deletions src/epipipeline/standardise/dengue/functions/geocode.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from googlemaps import Client as GoogleMaps
import googlemaps
import gmaps
import numpy as np

# 2 ways to retrieve API key - env file or local encryption
# Method 1 - Run this if you have set the API key in a .env file
Expand Down Expand Up @@ -31,7 +32,7 @@

# Geocoding function

def geocode(full_address: str, MyAPI: str) -> tuple:
def Geocode(full_address: str, MyAPI: str) -> tuple:
"""_summary_
Args:
Expand All @@ -52,11 +53,4 @@ def geocode(full_address: str, MyAPI: str) -> tuple:
raise Exception("geocoding failed")
except Exception as e:
print(f"Unable to geocode {full_address}")
return None,None

# run function

# Apply function
# df["location.geometry.latitude.imputed"], df["location.geometry.longitude.imputed"] = zip(df["full_address"].apply(lambda x: geocode(x, MyAPI)))


return np.nan,np.nan
19 changes: 19 additions & 0 deletions src/epipipeline/standardise/dengue/functions/standardise_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import re
import numpy as np

def standardise_result(x) -> str:
    """Standardise a raw test result to NEGATIVE or POSITIVE.

    Only str and int inputs are examined; every other type yields NaN. The
    value is converted to text and searched, case-insensitively, first for a
    negative marker and then for a positive marker. The markers are matched
    as substrings anywhere in the text, so the negative check takes
    precedence whenever both kinds of marker occur.

    Args:
        x (str/int): Result in the raw dataset.

    Returns:
        str: "NEGATIVE", "POSITIVE", or NaN when no marker is found.
    """
    if not isinstance(x, (str, int)):
        return np.nan

    text = str(x)
    # Order matters: negative markers are tested before positive ones.
    marker_labels = (
        (r"-ve|Neg|Negative|No|0", "NEGATIVE"),
        (r"NS1|IgM|D|Yes|\+ve|Pos|Positive|1", "POSITIVE"),
    )
    for marker, label in marker_labels:
        if re.search(marker, text, re.IGNORECASE):
            return label
    return np.nan

0 comments on commit 00c5705

Please sign in to comment.