Skip to content

Commit

Permalink
Add special handling in allele validation for all-NA columns and non-string columns
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Nov 8, 2023
1 parent 780ee66 commit da74efc
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 6 deletions.
24 changes: 18 additions & 6 deletions genie/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def get_invalid_allele_rows(
allowed_comb_alleles: list,
allowed_ind_alleles: list,
ignore_case: bool = False,
allow_na: bool = False,
allow_na: bool = False
) -> pd.Index:
"""
Find invalid indices in a DataFrame column based on allowed allele values.
Expand All @@ -439,7 +439,6 @@ def get_invalid_allele_rows(
ignore_case (bool, optional): whether to perform case-insensitive matching
allow_na (bool, optional): whether to allow NAs to be an allowed allele
value or not.
Returns:
pd.Index: A pandas index object indicating the row indices that
don't match the allowed alleles
Expand All @@ -456,10 +455,23 @@ def get_invalid_allele_rows(
else:
flags = 0 # no flags

matching_indices = input_data[input_col].str.match(
search_str, flags=flags, na=allow_na
)
invalid_indices = input_data[~matching_indices].index
# special handling for all NA column
is_all_na = pd.isna(input_data[input_col]).all()
if is_all_na and allow_na:
invalid_indices = pd.Index([])
elif is_all_na and not allow_na:
invalid_indices = input_data.index
else:
# convert numeric cols to string while preserving NAs in order to use str.match
transformed_data = input_data.copy()
transformed_data[input_col] = transform._convert_col_with_nas_to_str(
transformed_data, input_col
)

matching_indices = transformed_data[input_col].str.match(
search_str, flags=flags, na=allow_na
)
invalid_indices = transformed_data[~matching_indices].index
return invalid_indices


Expand Down
44 changes: 44 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -866,6 +866,50 @@ def get_invalid_allele_rows_test_cases():
"ignore_case": True,
"allow_na": True,
},
{
"name": "float_nas_not_allowed",
"input": pd.DataFrame(
{"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]}
),
"expected_index": pd.Index([0, 1, 2, 3, 4]),
"allowed_comb_alleles": ["A", "T", "C", "G", "-"],
"allowed_ind_alleles": [],
"ignore_case": True,
"allow_na": False,
},
{
"name": "float_nas_allowed",
"input": pd.DataFrame(
{"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]}
),
"expected_index": pd.Index([0, 1, 3, 4]),
"allowed_comb_alleles": ["A", "T", "C", "G", "-"],
"allowed_ind_alleles": [],
"ignore_case": True,
"allow_na": True,
},
{
"name": "all_missing_nas_allowed",
"input": pd.DataFrame(
{"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]}
),
"expected_index": pd.Index([]),
"allowed_comb_alleles": ["A", "T", "C", "G", "-"],
"allowed_ind_alleles": [],
"ignore_case": True,
"allow_na": True,
},
{
"name": "all_missing_nas_not_allowed",
"input": pd.DataFrame(
{"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]}
),
"expected_index": pd.Index([0, 1, 2]),
"allowed_comb_alleles": ["A", "T", "C", "G", "-"],
"allowed_ind_alleles": [],
"ignore_case": True,
"allow_na": False,
},
]


Expand Down

0 comments on commit da74efc

Please sign in to comment.