Skip to content

Commit

Permalink
Merge pull request #1 from dsih-artpark/development
Browse files Browse the repository at this point in the history
Merging stashed changes from dev to production
  • Loading branch information
snehasaisneha authored Feb 23, 2024
2 parents eeedb1d + c0abcf8 commit f692b96
Show file tree
Hide file tree
Showing 9 changed files with 711 additions and 119 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -139,4 +139,7 @@ dmypy.json
**.Rhistory

# MacOS
.DS_Store
.DS_Store

# All Data Files
*.csv
294 changes: 293 additions & 1 deletion poetry.lock

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "epipipeline-v2"
version = "2.1.0"
version = "2.0.10"
description = ""
authors = ["Sai Sneha <[email protected]>"]
readme = "README.md"
Expand All @@ -11,6 +11,9 @@ pandas = "^2.2.0"
boto3 = "^1.34.40"
requests = "^2.31.0"
fuzzywuzzy = "^0.18.0"
openpyxl = "^3.1.2"
python-levenshtein = "^0.25.0"
pyyaml = "^6.0.1"


[build-system]
Expand Down
28 changes: 22 additions & 6 deletions src/epipipeline_v2/preprocess/dengue/karnataka.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,19 @@
import os


def read_ka_linelist(raw_data_dir, sheet_codes, regionIDs_dict, verbose=False):
def read_ka_linelist(raw_data_dir: str, sheet_codes: dict, regionIDs_dict: dict, verbose: bool = False):
"""Function to read Karnataka Linelists. Not yet completely robust.
Args:
raw_data_dir (str): directory containing xlsx files to be read. Currently only supports local directories.
sheet_codes (dict): dictionary that maps each tab (sheet) of the xlsx to a district regionID based on LGD.
regionIDs_dict (dict): LGD Source of Truth in dictionary format.
verbose (bool, optional): If True, print the script log as it runs (intended for development).
For production, it's recommended to use the returned error instead. Defaults to False.
Returns:
: TODO(review): return value is undocumented (auto-generated placeholder); describe what is returned.
"""

raw_data_dict = dict()

Expand Down Expand Up @@ -35,7 +47,8 @@ def read_ka_linelist(raw_data_dir, sheet_codes, regionIDs_dict, verbose=False):


def preprocess_ka_linelist_v2(raw_data_dict, preprocess_metadata,
accepted_headers, regionIDs_dict):
accepted_headers, regionIDs_dict,
verbose=False):

error = []
preprocessed_data_dict = {}
Expand All @@ -50,7 +63,8 @@ def preprocess_ka_linelist_v2(raw_data_dict, preprocess_metadata,
if len(df) <= 1:
e = "District " + district_name + " (" + district + ") has no data."
error.append(e)
print(e)
if verbose:
print(e)
continue

# To account for empty excel sheets with one lone value in the 10000th row
Expand All @@ -66,7 +80,8 @@ def preprocess_ka_linelist_v2(raw_data_dict, preprocess_metadata,
if k == 5:
e = "District " + district_name + " (" + district + ") has no data."
error.append(e)
print(e)
if verbose:
print(e)
continue

if district in preprocess_metadata["no_merge_headers"]:
Expand Down Expand Up @@ -112,7 +127,7 @@ def preprocess_ka_linelist_v2(raw_data_dict, preprocess_metadata,
for option in name_options:
header_mapper[option] = standard_name

df = df.rename(column=header_mapper)
df = df.rename(columns=header_mapper)

# Rename all recognised columns to standard names

Expand Down Expand Up @@ -157,7 +172,8 @@ def preprocess_ka_linelist_v2(raw_data_dict, preprocess_metadata,
") is missing " + str(len(absent_headers)) + " header(s): " + \
", ".join(absent_headers) + "."
error.append(e)
print(e)
if verbose:
print(e)

preprocessed_data_dict[district] = df

Expand Down
Loading

0 comments on commit f692b96

Please sign in to comment.