Skip to content

Commit

Permalink
reformat code
Browse files Browse the repository at this point in the history
  • Loading branch information
danlu1 committed Nov 13, 2024
1 parent d95d684 commit ab7c931
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 48 deletions.
61 changes: 44 additions & 17 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -982,7 +982,13 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
return dataset[list(schema.keys())]


def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possible_values: list, na_allowed: bool = False, sep: Optional[str] = None) -> pd.Index:
def get_row_indices_for_invalid_column_values(
df: pd.DataFrame,
col: str,
possible_values: list,
na_allowed: bool = False,
sep: Optional[str] = None,
) -> pd.Index:
"""This function checks the column values against possible_values and returns row indices of invalid rows.
Currently, this function is only used in assay.py
Expand All @@ -994,7 +1000,7 @@ def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possib
sep (Optional[str], optional): The string separator. Defaults to None.
Returns:
pd.Index: The row indices of the rows with values that are not in possible_values.
pd.Index: The row indices of the rows with values that are not in possible_values.
"""
if na_allowed:
# this is only useful for dropping NAs for individual values rather than value_list
Expand All @@ -1003,12 +1009,17 @@ def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possib
check_values = df[col]
if sep:
# for columns that contain lists of values
check_values = check_values.apply(lambda x: all(substring in possible_values for substring in x.split(sep)))
else:
check_values = check_values.apply(
lambda x: all(substring in possible_values for substring in x.split(sep))
)
else:
check_values = check_values.apply(lambda x: x in possible_values)
return check_values[check_values == False].index

def get_message_for_invalid_column_value(col: str, filename: str, invalid_indices: pd.Index, possible_values: list) -> tuple:

def get_message_for_invalid_column_value(
col: str, filename: str, invalid_indices: pd.Index, possible_values: list
) -> tuple:
"""This function returns the error and warning messages if the target column has rows with invalid values.
Currently, this function is only used in assay.py
Expand All @@ -1025,15 +1036,27 @@ def get_message_for_invalid_column_value(col: str, filename: str, invalid_indice
error = ""
# check the validity of values in the column
# Concatenate the possible values into a single comma-separated string, dropping any ".0". This is done because of pandas typing: an integer column with one NA/blank value will be cast as a double.
possible_values = ", ".join([str(value).replace(".0", "")for value in possible_values])
if len(invalid_indices) > 0:
error = (f"{filename}: Please double check your {col} column. Valid values are {possible_values}. "
f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n")
possible_values = ", ".join(
[str(value).replace(".0", "") for value in possible_values]
)
if len(invalid_indices) > 0:
error = (
f"{filename}: Please double check your {col} column. Valid values are {possible_values}. "
f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
)
return (warning, error)


def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_values: list, filename: str, na_allowed: bool = False, required=False, sep: Optional[str] = None) -> tuple:
def check_column_and_values_row_specific(
df: pd.DataFrame,
col: str,
possible_values: list,
filename: str,
na_allowed: bool = False,
required=False,
sep: Optional[str] = None,
) -> tuple:
"""This function checks if the column exists and checks if the values in the column have the valid values.
Currently, this function is only used in assay.py
Expand All @@ -1051,7 +1074,7 @@ def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_va
"""
warning = ""
error = ""
# check the existence of the column
# check the existence of the column
have_column = checkColExist(df, col)
if not have_column:
if required:
Expand All @@ -1063,10 +1086,14 @@ def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_va
"{filename}: Doesn't have {col} column. "
"This column will be added.\n".format(filename=filename, col=col)
)
else:
else:
# get the row indices
invalid_indices = get_row_indices_for_invalid_column_values(df, col, possible_values, na_allowed, sep)
invalid_indices = get_row_indices_for_invalid_column_values(
df, col, possible_values, na_allowed, sep
)
# generate validation message
warning, error = get_message_for_invalid_column_value(col, filename, invalid_indices, possible_values)

return (warning, error)
warning, error = get_message_for_invalid_column_value(
col, filename, invalid_indices, possible_values
)

return (warning, error)
96 changes: 65 additions & 31 deletions tests/test_process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
import pytest
import synapseclient
from genie import process_functions
from pandas.api.types import (is_bool_dtype, is_float_dtype, is_integer_dtype,
is_string_dtype)
from pandas.api.types import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_string_dtype,
)
from pandas.testing import assert_frame_equal

DATABASE_DF = pd.DataFrame(
Expand Down Expand Up @@ -752,10 +756,20 @@ def get_row_indices_for_invalid_column_values_test_cases():
},
{
"name": "values_in_list",
"df": pd.DataFrame({"test_col": ["Val1;Val2", "Val1;Val2;Val3","Val1", "Val1;", "Val1;None"]}),
"df": pd.DataFrame(
{
"test_col": [
"Val1;Val2",
"Val1;Val2;Val3",
"Val1",
"Val1;",
"Val1;None",
]
}
),
"col": "test_col",
"possible_values": ["Val1", "Val2"],
"na_allowed": True,
"na_allowed": True,
"sep": ";",
"expected_index": pd.Index([1, 3, 4]),
},
Expand All @@ -769,18 +783,25 @@ def get_row_indices_for_invalid_column_values_test_cases():
"expected_index": pd.Index([]),
},
]


@pytest.mark.parametrize(
"test_cases", get_row_indices_for_invalid_column_values_test_cases(), ids=lambda x: x["name"]
"test_cases",
get_row_indices_for_invalid_column_values_test_cases(),
ids=lambda x: x["name"],
)
def test_get_row_indices_for_invalid_column_values(test_cases):
df = test_cases["df"]
col = test_cases["col"]
possible_values = test_cases["possible_values"]
na_allowed = test_cases["na_allowed"]
sep = test_cases["sep"]
results = process_functions.get_row_indices_for_invalid_column_values(df, col, possible_values, na_allowed, sep)
results = process_functions.get_row_indices_for_invalid_column_values(
df, col, possible_values, na_allowed, sep
)
assert results.equals(test_cases["expected_index"])


def get_message_for_invalid_column_value_test_cases():
return [
{
Expand All @@ -789,10 +810,10 @@ def get_message_for_invalid_column_value_test_cases():
"filename": "test_filename",
"invalid_indices": pd.Index([1, 2, 3]),
"possible_values": ["Val1"],
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
"You have 3 row(s) in your file where test_col column contains invalid values. "\
"The row(s) this occurs in are: [1, 2, 3]. Please correct.\n",
"expected_warning": ""
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
"You have 3 row(s) in your file where test_col column contains invalid values. "
"The row(s) this occurs in are: [1, 2, 3]. Please correct.\n",
"expected_warning": "",
},
{
"name": "valid_data",
Expand All @@ -801,21 +822,28 @@ def get_message_for_invalid_column_value_test_cases():
"invalid_indices": pd.Index([]),
"possible_values": ["Val1", "Val2"],
"expected_error": "",
"expected_warning": ""
"expected_warning": "",
},
]


@pytest.mark.parametrize(
"test_cases", get_message_for_invalid_column_value_test_cases(), ids=lambda x: x["name"]
"test_cases",
get_message_for_invalid_column_value_test_cases(),
ids=lambda x: x["name"],
)
def test_get_message_for_invalid_column_value(test_cases):
col = test_cases["col"]
filename = test_cases["filename"]
invalid_indices = test_cases["invalid_indices"]
possible_values = test_cases["possible_values"]
warning, error = process_functions.get_message_for_invalid_column_value(col, filename, invalid_indices, possible_values)
warning, error = process_functions.get_message_for_invalid_column_value(
col, filename, invalid_indices, possible_values
)
assert warning == test_cases["expected_warning"]
assert error == test_cases["expected_error"]


def check_col_and_values_row_specific_test_cases():
return [
{
Expand All @@ -828,7 +856,7 @@ def check_col_and_values_row_specific_test_cases():
"required": True,
"sep": ";",
"expected_error": "",
"expected_warning": ""
"expected_warning": "",
},
{
"name": "valid_data_with_individual_value_na_allowed",
Expand All @@ -840,7 +868,7 @@ def check_col_and_values_row_specific_test_cases():
"required": True,
"sep": ";",
"expected_error": "",
"expected_warning": ""
"expected_warning": "",
},
{
"name": "missing_required_column",
Expand All @@ -852,7 +880,7 @@ def check_col_and_values_row_specific_test_cases():
"required": True,
"sep": ";",
"expected_error": "test_filename: Must have test_col1 column.\n",
"expected_warning": ""
"expected_warning": "",
},
{
"name": "missing_optional_column",
Expand All @@ -864,7 +892,7 @@ def check_col_and_values_row_specific_test_cases():
"required": False,
"sep": ";",
"expected_error": "",
"expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n"
"expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n",
},
{
"name": "invalid_data_with_value_list",
Expand All @@ -875,10 +903,10 @@ def check_col_and_values_row_specific_test_cases():
"na_allowed": True,
"required": True,
"sep": ";",
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
"You have 2 row(s) in your file where test_col column contains invalid values. "\
"The row(s) this occurs in are: [1, 2]. Please correct.\n",
"expected_warning": ""
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
"You have 2 row(s) in your file where test_col column contains invalid values. "
"The row(s) this occurs in are: [1, 2]. Please correct.\n",
"expected_warning": "",
},
{
"name": "invalid_data_with_individual_value_na_not_allowed",
Expand All @@ -889,10 +917,10 @@ def check_col_and_values_row_specific_test_cases():
"na_allowed": False,
"required": True,
"sep": None,
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. "\
"You have 3 row(s) in your file where test_col column contains invalid values. "\
"The row(s) this occurs in are: [2, 3, 4]. Please correct.\n",
"expected_warning": ""
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. "
"You have 3 row(s) in your file where test_col column contains invalid values. "
"The row(s) this occurs in are: [2, 3, 4]. Please correct.\n",
"expected_warning": "",
},
{
"name": "invalid_data_with_individual_value_na_allowed",
Expand All @@ -903,14 +931,18 @@ def check_col_and_values_row_specific_test_cases():
"na_allowed": True,
"required": True,
"sep": None,
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
"You have 2 row(s) in your file where test_col column contains invalid values. "\
"The row(s) this occurs in are: [1, 2]. Please correct.\n",
"expected_warning": ""
"expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
"You have 2 row(s) in your file where test_col column contains invalid values. "
"The row(s) this occurs in are: [1, 2]. Please correct.\n",
"expected_warning": "",
},
]


@pytest.mark.parametrize(
"test_cases", check_col_and_values_row_specific_test_cases(), ids=lambda x: x["name"]
"test_cases",
check_col_and_values_row_specific_test_cases(),
ids=lambda x: x["name"],
)
def test_check_col_and_values_row_specific(test_cases):
df = test_cases["df"]
Expand All @@ -920,6 +952,8 @@ def test_check_col_and_values_row_specific(test_cases):
na_allowed = test_cases["na_allowed"]
required = test_cases["required"]
sep = test_cases["sep"]
warning, error = process_functions.check_column_and_values_row_specific(df, col, possible_values, filename, na_allowed, required, sep)
warning, error = process_functions.check_column_and_values_row_specific(
df, col, possible_values, filename, na_allowed, required, sep
)
assert warning == test_cases["expected_warning"]
assert error == test_cases["expected_error"]

0 comments on commit ab7c931

Please sign in to comment.