diff --git a/genie/validate.py b/genie/validate.py index 733749b5..049cde88 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -424,7 +424,7 @@ def get_invalid_allele_rows( allowed_comb_alleles: list, allowed_ind_alleles: list, ignore_case: bool = False, - allow_na: bool = False, + allow_na: bool = False ) -> pd.Index: """ Find invalid indices in a DataFrame column based on allowed allele values. @@ -439,7 +439,6 @@ def get_invalid_allele_rows( ignore_case (bool, optional): whether to perform case-insensitive matching allow_na (bool, optional): whether to allow NAs to be an allowed allele value or not. - Returns: pd.Index: A pandas index object indicating the row indices that don't match the allowed alleles @@ -456,10 +455,23 @@ def get_invalid_allele_rows( else: flags = 0 # no flags - matching_indices = input_data[input_col].str.match( - search_str, flags=flags, na=allow_na - ) - invalid_indices = input_data[~matching_indices].index + # special handling for all NA column + is_all_na = pd.isna(input_data[input_col]).all() + if is_all_na and allow_na: + invalid_indices = pd.Index([]) + elif is_all_na and not allow_na: + invalid_indices = input_data.index + else: + # convert numeric cols to string while preserving NAs in order to use str.match + transformed_data = input_data.copy() + transformed_data[input_col] = transform._convert_col_with_nas_to_str( + transformed_data, input_col + ) + + matching_indices = transformed_data[input_col].str.match( + search_str, flags=flags, na=allow_na + ) + invalid_indices = transformed_data[~matching_indices].index return invalid_indices diff --git a/tests/test_validate.py b/tests/test_validate.py index fa52bb4a..4e16cfee 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -866,6 +866,50 @@ def get_invalid_allele_rows_test_cases(): "ignore_case": True, "allow_na": True, }, + { + "name": "float_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 2, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, + { + "name": "float_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [1.5, 2.0, float("nan"), 3.5, 4.0]} + ), + "expected_index": pd.Index([0, 1, 3, 4]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "all_missing_nas_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": True, + }, + { + "name": "all_missing_nas_not_allowed", + "input": pd.DataFrame( + {"REFERENCE_ALLELE": [float("nan"), float("nan"), float("nan")]} + ), + "expected_index": pd.Index([0, 1, 2]), + "allowed_comb_alleles": ["A", "T", "C", "G", "-"], + "allowed_ind_alleles": [], + "ignore_case": True, + "allow_na": False, + }, ]