diff --git a/schematic/models/validate_attribute.py b/schematic/models/validate_attribute.py index e196bbe14..9247ac79e 100644 --- a/schematic/models/validate_attribute.py +++ b/schematic/models/validate_attribute.py @@ -914,14 +914,14 @@ def get_target_manifests( def list_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, - ) -> tuple[list[list[str]], list[list[str]], pd.core.series.Series]: + manifest_col: pd.Series, + ) -> tuple[list[list[str]], list[list[str]], pd.Series]: """ Purpose: Determine if values for a particular attribute are comma separated. Input: - val_rule: str, Validation rule - - manifest_col: pd.core.series.Series, column for a given attribute + - manifest_col: pd.Series, column for a given attribute Returns: - manifest_col: Input values in manifest arere-formatted to a list logger.error or logger.warning. @@ -932,8 +932,8 @@ def list_validation( # For each 'list' (input as a string with a , delimiter) entered, # convert to a real list of strings, with leading and trailing # white spaces removed. - errors = [] - warnings = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] replace_null = True csv_re = comma_separated_list_regex() @@ -954,7 +954,10 @@ def list_validation( entry=list_string, node_display_name=manifest_col.name, ) - + # Because of the above line: manifest_col = manifest_col.astype(str) + # this column has been turned into a string, it's unclear if any values + # from this column can be anything other than a string, and therefore this + # if statement may not be needed if not isinstance(list_string, str) and entry_has_value: list_error = "not_a_string" elif not re.fullmatch(csv_re, list_string) and entry_has_value: @@ -983,7 +986,7 @@ def list_validation( def regex_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, + manifest_col: pd.Series, ) -> tuple[list[list[str]], list[list[str]]]: """ Purpose: @@ -991,7 +994,7 @@ def regex_validation( provided in val_rule. 
Input: - val_rule: str, Validation rule - - manifest_col: pd.core.series.Series, column for a given + - manifest_col: pd.Series, column for a given attribute in the manifest - dmge: DataModelGraphExplorer Object Using this module requres validation rules written in the following manner: @@ -1005,8 +1008,8 @@ def regex_validation( - This function will return errors when the user input value does not match schema specifications. logger.error or logger.warning. - Errors: list[str] Error details for further storage. - warnings: list[str] Warning details for further storage. + Errors: list[list[str]] Error details for further storage. + warnings: list[list[str]] Warning details for further storage. TODO: move validation to convert step. """ @@ -1022,13 +1025,15 @@ def regex_validation( f" They should be provided as follows ['regex', 'module name', 'regular expression']" ) - errors = [] - warnings = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] validation_rules = self.dmge.get_node_validation_rules( node_display_name=manifest_col.name ) - + # It seems like this statement can never be true + # self.dmge.get_node_validation_rules never returns a list with "::" even when + # the attribute has the "list::regex" rule if validation_rules and "::" in validation_rules[0]: validation_rules = validation_rules[0].split("::") # Handle case where validating re's within a list. 
@@ -1096,7 +1101,7 @@ def regex_validation( def type_validation( self, val_rule: str, - manifest_col: pd.core.series.Series, + manifest_col: pd.Series, ) -> tuple[list[list[str]], list[list[str]]]: """ Purpose: @@ -1105,7 +1110,7 @@ def type_validation( Input: - val_rule: str, Validation rule, specifying input type, either 'float', 'int', 'num', 'str' - - manifest_col: pd.core.series.Series, column for a given + - manifest_col: pd.Series, column for a given attribute in the manifest Returns: -This function will return errors when the user input value @@ -1123,8 +1128,8 @@ def type_validation( "str": (str), } - errors = [] - warnings = [] + errors: list[list[str]] = [] + warnings: list[list[str]] = [] # num indicates either a float or int. if val_rule == "num": @@ -1147,6 +1152,7 @@ def type_validation( ) if vr_errors: errors.append(vr_errors) + # It seems impossible to get warnings with type rules if vr_warnings: warnings.append(vr_warnings) elif val_rule in ["int", "float", "str"]: @@ -1169,6 +1175,7 @@ def type_validation( ) if vr_errors: errors.append(vr_errors) + # It seems impossible to get warnings with type rules if vr_warnings: warnings.append(vr_warnings) return errors, warnings @@ -1286,14 +1293,14 @@ def url_validation( return errors, warnings def _parse_validation_log( - self, validation_log: dict[str, pd.core.series.Series] - ) -> tuple[[list[str], list[str], list[str]]]: + self, validation_log: dict[str, pd.Series] + ) -> tuple[list[str], list[str], list[str]]: """Parse validation log, so values can be used to raise warnings/errors Args: - validation_log, dict[str, pd.core.series.Series]: + validation_log, dict[str, pd.Series]: Returns: invalid_rows, list: invalid rows recorded in the validation log - invalid_enties, list: invalid values recorded in the validation log + invalid_entities, list: invalid values recorded in the validation log manifest_ids, list: """ # Initialize parameters @@ -1313,12 +1320,15 @@ def _parse_validation_log( return 
invalid_rows, invalid_entries, manifest_ids def _merge_format_invalid_rows_values( - self, series_1: pd.core.series.Series, series_2: pd.core.series.Series - ) -> tuple[[list[str], list[str]]]: - """Merge two series to identify gather all invalid values, and parse out invalid rows and entries + self, series_1: pd.Series, series_2: pd.Series + ) -> tuple[list[str], list[str]]: + """ + Merge two series to identify gather all invalid values, + and parse out invalid rows and entries + Args: - series_1, pd.core.series.Series: first set of invalid values to extract - series_2, pd.core.series.Series: second set of invalid values to extract + series_1, pd.Series: first set of invalid values to extract + series_2, pd.Series: second set of invalid values to extract Returns: invalid_rows, list: invalid rows taken from both series invalid_entry, list: invalid values taken from both series @@ -1342,12 +1352,14 @@ def _merge_format_invalid_rows_values( return invalid_rows, invalid_entry def _format_invalid_row_values( - self, invalid_values: dict[str, pd.core.series.Series] - ) -> tuple[[list[str], list[str]]]: - """Parse invalid_values dictionary, to extract invalid_rows and invalid_entry to be used later - to raise warnings or errors. + self, invalid_values: pd.Series + ) -> tuple[list[str], list[str]]: + """ + Parse invalid_values, to extract invalid_rows and invalid_entry + to be used later to raise warnings or errors. 
+ Args: - invalid_values, dict[str, pd.core.series.Series]: + invalid_values, pd.Series: Returns: invalid_rows, list: invalid rows recorded in invalid_values invalid_entry, list: invalid values recorded in invalid_values @@ -1383,9 +1395,9 @@ def _gather_set_warnings_errors( Returns: errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + not pass relevant cross manifest validation across the target manifest(s) warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + not pass relevant cross manifest validation across the target manifest(s) """ errors: list[str] = [] warnings: list[str] = [] @@ -1440,21 +1452,28 @@ def _remove_non_entry_from_invalid_entry_list( row_num: Optional[list[str]], attribute_name: str, ) -> tuple[list[str], list[str]]: - """Helper to remove NAs from a list of invalid entries (if applicable, and allowed), remove the row - too from row_num. This will make sure errors are not rasied for NA entries unless the value is required. + """ + Helper to remove NAs from a list of invalid entries (if applicable, and allowed), + remove the row too from row_num. This will make sure errors are not raised for + NA entries unless the value is required. + Args: invalid_entry, list[str]: default=None, list of entries in the source manifest where - invalid values were located. - row_num, list[str[: default=None, list of rows in the source manifest where invalid values were located + invalid values were located. + row_num, list[str[: default=None, list of rows in the source manifest where invalid + values were located attribute_name, str: source attribute name + Returns: - invalid_entry and row_num returned with any NA and corresponding row index value removed, if applicable. 
+ invalid_entry and row_num returned with any NA and corresponding row index value + removed, if applicable. """ idx_to_remove = [] # Check if the current attribute column is required, via the data model if invalid_entry and row_num: # Check each invalid entry and determine if it has a value and/or is required. - # If there is no entry and its not required, remove the NA value so an error is not raised. + # If there is no entry and its not required, remove the NA value so an + # error is not raised. for idx, entry in enumerate(invalid_entry): entry_has_value = self.get_entry_has_value(entry, attribute_name) # If there is no value, and is not required, recored the index @@ -1466,8 +1485,8 @@ def _remove_non_entry_from_invalid_entry_list( for idx in sorted(idx_to_remove, reverse=True): del invalid_entry[idx] del row_num[idx] - # Perform check to make sure length of invalid_entry and row_num is the same. If not that would suggest - # there was an issue recording or removing values. + # Perform check to make sure length of invalid_entry and row_num is the same. + # If not that would suggest there was an issue recording or removing values. if len(invalid_entry) != len(row_num): logger.error( f"There was an error handling and validating a non-entry." 
@@ -1530,17 +1549,22 @@ def _gather_value_warnings_errors( source_attribute: str, value_validation_store: tuple[pd.Series, pd.Series, pd.Series], ) -> tuple[list[str], list[str]]: - """For value rule scope, find invalid rows and entries, and generate appropriate errors and warnings + """ + For value rule scope, find invalid rows and entries, and generate + appropriate errors and warnings + Args: val_rule, str: Validation rule source_attribute, str: source manifest column name value_validation_store, tuple(pd.Series, pd.Series, pd.Series]): contains missing_values, duplicated_values, and repeat values Returns: - errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) - warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do - not pass relevant cross mannifest validation across the target manifest(s) + errors, list[str]: list of errors to raise, as appropriate, if values + in current manifest do not pass relevant cross manifest validation + across the target manifest(s) + warnings, list[str]: list of warnings to raise, as appropriate, + if values in current manifest do not pass relevant cross manifest + validation across the target manifest(s) """ # Initialize with empty lists errors, warnings = [], [] @@ -1580,17 +1604,22 @@ def _gather_value_warnings_errors( def _check_if_target_manifest_is_empty( self, - target_manifest: pd.core.series.Series, + target_manifest: pd.DataFrame, target_manifest_empty: list[bool], column_names: dict[str, str], ) -> list[bool]: - """If a target manifest is found with the attribute column of interest check to see if the manifest is empty. + """ + If a target manifest is found with the attribute column of interest check to see if + the manifest is empty. 
+ Args: - target_manifest, pd.core.series.Series: Current target manifest - target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not. + target_manifest, pd.Dataframe: Current target manifest + target_manifest_empty, list[bool]: a list of booleans recording if the target manifest + are empty or not. column_names, dict[str, str]: {stripped_col_name:original_column_name} Returns: - target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not. + target_manifest_empty, list[bool]: a list of booleans recording if the target manifest + are empty or not. """ # Make a copy of the target manifest with only user uploaded columns target_manifest_dupe = target_manifest.drop( @@ -2040,7 +2069,7 @@ def cross_validation( def filename_validation( self, val_rule: str, - manifest: pd.core.frame.DataFrame, + manifest: pd.DataFrame, access_token: str, dataset_scope: str, project_scope: Optional[list] = None, @@ -2050,7 +2079,7 @@ def filename_validation( Validate the filenames in the manifest against the data paths in the fileview. Args: val_rule: str, Validation rule for the component - manifest: pd.core.frame.DataFrame, manifest + manifest: pd.DataFrame, manifest access_token: str, Asset Store access token dataset_scope: str, Dataset with files to validate against project_scope: Optional[list] = None: Projects to limit the scope of cross manifest validation to. 
diff --git a/tests/unit/test_validate_attribute.py b/tests/unit/test_validate_attribute.py index d782fab12..fa1070428 100644 --- a/tests/unit/test_validate_attribute.py +++ b/tests/unit/test_validate_attribute.py @@ -3,9 +3,10 @@ from typing import Generator from unittest.mock import Mock, patch +from jsonschema import ValidationError +from pandas import Series, DataFrame, concat import numpy as np import pytest -from pandas import DataFrame, Series, concat import schematic.models.validate_attribute from schematic.models.validate_attribute import GenerateError, ValidateAttribute @@ -73,6 +74,15 @@ } ) +TEST_DF3 = DataFrame( + { + "PatientID": ["A", "A", "A", "B", "C"], + "component": ["comp1", "comp1", "comp1", "comp1", "comp1"], + "id": ["id1", "id2", "id3", "id4", "id5"], + "entityid": ["x", "x", "x", "x", "x"], + } +) + TEST_DF_MISSING_VALS = DataFrame( { "PatientID": [np.isnan, ""], @@ -155,10 +165,10 @@ def fixture_va_obj( yield ValidateAttribute(dmge) -@pytest.fixture(name="cross_val_df1") -def fixture_cross_val_df1() -> Generator[DataFrame, None, None]: +@pytest.fixture(name="test_df1") +def fixture_test_df1() -> Generator[DataFrame, None, None]: """Yields a dataframe""" - df = DataFrame( + yield DataFrame( { "PatientID": ["A", "B", "C"], "component": ["comp1", "comp1", "comp1"], @@ -166,44 +176,10 @@ def fixture_cross_val_df1() -> Generator[DataFrame, None, None]: "entityid": ["x", "x", "x"], } ) - yield df - - -@pytest.fixture(name="cross_val_df2") -def fixture_cross_val_df2(cross_val_df1: DataFrame) -> Generator[DataFrame, None, None]: - """Yields dataframe df1 with an extra row""" - df = concat( - [ - cross_val_df1, - DataFrame( - { - "PatientID": ["D"], - "component": ["comp1"], - "id": ["id4"], - "entityid": ["x"], - } - ), - ] - ) - yield df -@pytest.fixture(name="cross_val_df3") -def fixture_cross_val_df3() -> Generator[DataFrame, None, None]: - """Yields empty dataframe""" - df = DataFrame( - { - "PatientID": [], - "component": [], - "id": [], - 
"entityid": [], - } - ) - yield df - - -@pytest.fixture(name="cross_val_col_names") -def fixture_cross_val_col_names() -> Generator[dict[str, str], None, None]: +@pytest.fixture(name="test_df_col_names") +def fixture_test_df_col_names() -> Generator[dict[str, str], None, None]: """ Yields: Generator[dict[str, str], None, None]: A dicitonary of column names @@ -314,10 +290,10 @@ class TestValidateAttributeObject: @pytest.mark.parametrize("series", EXACTLY_ATLEAST_PASSING_SERIES) @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_SET_RULES) - def test_cross_validation_match_atleast_one_set_rules_passing( + def test_cross_validation_match_atleast_one_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -328,16 +304,16 @@ def test_cross_validation_match_atleast_one_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @pytest.mark.parametrize("series", EXACTLY_ATLEAST_PASSING_SERIES) @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_SET_RULES) - def test_cross_validation_match_exactly_one_set_rules_passing( + def test_cross_validation_match_exactly_one_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -348,7 +324,7 @@ def test_cross_validation_match_exactly_one_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @@ -362,10 +338,10 @@ def test_cross_validation_match_exactly_one_set_rules_passing( ], ) @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_SET_RULES) - def 
test_cross_validation_match_atleast_one_set_rules_errors( + def test_cross_validation_match_atleast_one_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -376,7 +352,7 @@ def test_cross_validation_match_atleast_one_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -394,10 +370,10 @@ def test_cross_validation_match_atleast_one_set_rules_errors( ], ) @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_SET_RULES) - def test_cross_validation_match_exactly_one_set_rules_errors( + def test_cross_validation_match_exactly_one_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -408,7 +384,7 @@ def test_cross_validation_match_exactly_one_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -429,10 +405,10 @@ def test_cross_validation_match_exactly_one_set_rules_errors( ], ) @pytest.mark.parametrize("rule", MATCH_NONE_SET_RULES) - def test_cross_validation_match_none_set_rules_passing( + def test_cross_validation_match_none_set_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -443,7 +419,7 @@ def test_cross_validation_match_none_set_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + 
return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, series) == ([], []) @@ -457,10 +433,10 @@ def test_cross_validation_match_none_set_rules_passing( ], ) @pytest.mark.parametrize("rule", MATCH_NONE_SET_RULES) - def test_cross_validation_match_none_set_rules_errors( + def test_cross_validation_match_none_set_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, series: Series, rule: str, ): @@ -471,7 +447,7 @@ def test_cross_validation_match_none_set_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, series) if rule.endswith("error"): @@ -482,6 +458,7 @@ def test_cross_validation_match_none_set_rules_errors( assert errors == [] @pytest.mark.parametrize("rule", MATCH_ATLEAST_ONE_VALUE_RULES) + @pytest.mark.parametrize("target_manifest", [TEST_DF1, TEST_DF3]) @pytest.mark.parametrize( "tested_column", [ @@ -493,12 +470,12 @@ def test_cross_validation_match_none_set_rules_errors( (["A", "B", "C", "C"]), ], ) - def test_cross_validation_value_match_atleast_one_rules_passing( + def test_cross_validation_match_atleast_one_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, rule: str, tested_column: list, + target_manifest: DataFrame, ): """ Tests ValidateAttribute.cross_validation @@ -507,7 +484,7 @@ def test_cross_validation_value_match_atleast_one_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": target_manifest}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -522,10 +499,10 @@ def test_cross_validation_value_match_atleast_one_rules_passing( Series([1], index=[0], name="PatientID"), ], ) - def 
test_cross_validation_value_match_atleast_one_rules_errors( + def test_cross_validation_match_atleast_one_value_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -536,7 +513,7 @@ def test_cross_validation_value_match_atleast_one_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -548,31 +525,38 @@ def test_cross_validation_value_match_atleast_one_rules_errors( @pytest.mark.parametrize("rule", MATCH_EXACTLY_ONE_VALUE_RULES) @pytest.mark.parametrize( - "tested_column", + "tested_column, target_manifest", [ - ([]), - (["A"]), - (["A", "A"]), - (["A", "B"]), - (["A", "B", "C"]), - (["A", "B", "C", "C"]), + ([], TEST_DF1), + ([], TEST_DF3), + (["C"], TEST_DF1), + (["C"], TEST_DF3), + (["C", "C"], TEST_DF1), + (["C", "C"], TEST_DF3), + (["A"], TEST_DF1), + (["A", "A"], TEST_DF1), + (["A", "B"], TEST_DF1), + (["A", "B", "C"], TEST_DF1), + (["A", "B", "C", "C"], TEST_DF1), ], ) - def test_cross_validation_match_exactly_one_value_rules_passing( + def test_cross_validation_match_exactly_one_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, rule: str, tested_column: list, + target_manifest: DataFrame, ): """ Tests ValidateAttribute.cross_validation These tests show what columns pass for matchExactlyOne + The first group are ones that pass for TEST_DF1 and TEST_DF3 + The second group are those that pass only for test """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": target_manifest}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -586,10 +570,10 @@ def 
test_cross_validation_match_exactly_one_value_rules_passing( Series([1], index=[0], name="PatientID"), ], ) - def test_cross_validation_value_match_exactly_one_rules_errors( + def test_cross_validation_match_exactly_one_value_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -600,7 +584,7 @@ def test_cross_validation_value_match_exactly_one_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -615,10 +599,10 @@ def test_cross_validation_value_match_exactly_one_rules_errors( "tested_column", [([]), (["D"]), (["D", "D"]), (["D", "F"]), ([1]), ([np.nan])], ) - def test_cross_validation_match_none_value_rules_passing( + def test_cross_validation_match_none_value_passing_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: list, ): @@ -629,7 +613,7 @@ def test_cross_validation_match_none_value_rules_passing( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): assert va_obj.cross_validation(rule, Series(tested_column)) == ([], []) @@ -642,10 +626,10 @@ def test_cross_validation_match_none_value_rules_passing( Series(["A", "A"], index=[0, 1], name="PatientID"), ], ) - def test_cross_validation_value_match_none_rules_errors( + def test_cross_validation_value_match_none_errors_one_df( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, rule: str, tested_column: Series, ): @@ -656,7 +640,7 @@ def test_cross_validation_value_match_none_rules_errors( with patch.object( schematic.models.validate_attribute.ValidateAttribute, 
"_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): errors, warnings = va_obj.cross_validation(rule, tested_column) if rule.endswith("error"): @@ -839,7 +823,7 @@ def test__run_validation_across_target_manifests_return_msg( @pytest.mark.parametrize("rule", ALL_VALUE_RULES) def test__run_validation_across_target_manifests_value_scope( - self, va_obj: ValidateAttribute, cross_val_df1: DataFrame, rule: str + self, va_obj: ValidateAttribute, test_df1: DataFrame, rule: str ) -> None: """Tests for ValidateAttribute._run_validation_across_target_manifests with value rule""" @@ -847,7 +831,7 @@ def test__run_validation_across_target_manifests_value_scope( with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="value", @@ -882,7 +866,7 @@ def test__run_validation_across_target_manifests_value_scope( def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_target( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -895,12 +879,12 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_ This shows that these rules behave the same. If all values in the column match the target manifest, the manifest id gets added to the present ids list. 
- Otherwise the maniferst id gets added to the missing ids list + Otherwise the manifest id gets added to the missing ids list """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -927,7 +911,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_one_ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_targets( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -938,13 +922,13 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_ Tests for ValidateAttribute._run_validation_across_target_manifests using matchAtleastOne set and matchExactlyOne rule. This shows these rules behave the same. 
- This also shows that when thare are multiple target mnaifests they both get added to + This also shows that when there are multiple target manifests they both get added to either the present of missing manifest ids """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -974,7 +958,7 @@ def test__run_validation_across_target_manifests_match_atleast_exactly_with_two_ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_target( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -985,13 +969,13 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_t Tests for ValidateAttribute._run_validation_across_target_manifests using matchNone set rule When there are nt matching values, no id get added - When there are mathcing values the id gets added to the repeat ids + When there are matching values the id gets added to the repeat ids """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1}, + return_value={"syn1": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -1021,7 +1005,7 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_one_t def test__run_validation_across_target_manifests_set_rules_match_none_with_two_targets( self, va_obj: ValidateAttribute, - cross_val_df1: DataFrame, + test_df1: DataFrame, input_column: list, missing_ids: list[str], present_ids: list[str], @@ -1032,13 +1016,13 @@ def test__run_validation_across_target_manifests_set_rules_match_none_with_two_t Tests 
for ValidateAttribute._run_validation_across_target_manifests using matchNone set rule When there are nt matching values, no id get added - When there are mathcing values the id gets added to the repeat ids + When there are matching values the id gets added to the repeat ids """ with patch.object( schematic.models.validate_attribute.ValidateAttribute, "_get_target_manifest_dataframes", - return_value={"syn1": cross_val_df1, "syn2": cross_val_df1}, + return_value={"syn1": test_df1, "syn2": test_df1}, ): _, validation_output = va_obj._run_validation_across_target_manifests( rule_scope="set", @@ -1115,11 +1099,11 @@ def test__run_validation_across_targets_value( ("syn3", ["syn1"], ["syn1", "syn3"]), ], ) - def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missing_values( + def test__run_validation_across_targets_set_match_exactly_atleast_one_no_missing_values( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, rule: str, tested_column: list, target_id: str, @@ -1134,10 +1118,10 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missin """ output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule=rule, - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(tested_column), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id=target_id, missing_manifest_log={}, present_manifest_log=present_log_input.copy(), @@ -1162,11 +1146,11 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_no_missin (["D", "F"], "syn3", [], []), ], ) - def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_values( + def test__run_validation_across_targets_set_match_exactly_atleast_one_missing_values( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - 
cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, rule: str, tested_column: list, target_id: str, @@ -1179,10 +1163,10 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_v """ output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule=rule, - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(tested_column), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id=target_id, missing_manifest_log={}, present_manifest_log=present_log_input.copy(), @@ -1199,17 +1183,17 @@ def test__run_validation_across_targets_set_match_exactly_atleaset_one_missing_v def test__run_validation_across_targets_set_match_none( self, va_obj: ValidateAttribute, - cross_val_col_names: dict[str, str], - cross_val_df1: DataFrame, + test_df_col_names: dict[str, str], + test_df1: DataFrame, ) -> None: """Tests for ValidateAttribute._run_validation_across_targets_set for matchAtLeastOne""" output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule="matchNone, Patient.PatientID, set", - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(["A", "B", "C"]), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id="syn1", missing_manifest_log={}, present_manifest_log=[], @@ -1226,10 +1210,10 @@ def test__run_validation_across_targets_set_match_none( output, bool_list1, bool_list2 = va_obj._run_validation_across_targets_set( val_rule="matchNone, Patient.PatientID, set", - column_names=cross_val_col_names, + column_names=test_df_col_names, manifest_col=Series(["A"]), target_attribute="patientid", - target_manifest=cross_val_df1, + target_manifest=test_df1, target_manifest_id="syn2", missing_manifest_log={}, present_manifest_log=[], @@ -1270,7 +1254,7 @@ def test__gather_value_warnings_errors_passing( ) -> 
None: """ Tests for ValidateAttribute._gather_value_warnings_errors - For matchAtLeastOne to pass there must be no mssing values + For matchAtLeastOne to pass there must be no missing values For matchExactlyOne there must be no missing or duplicated values For matchNone there must be no repeat values """ @@ -1574,3 +1558,662 @@ def test__get_column_names( ) -> None: """Tests for ValidateAttribute._get_column_names""" assert va_obj._get_column_names(DataFrame(input_dict)) == expected_dict + + ############## + # get_no_entry + ############## + + @pytest.mark.parametrize( + "input_entry, node_name, expected", + [ + ("entry", "Check NA", False), + ("entry", "Check Date", False), + ("", "Check NA", False), + ("", "Check Date", True), + ], + ) + def test_get_no_entry( + self, + va_obj: ValidateAttribute, + input_entry: str, + node_name: str, + expected: bool, + ) -> None: + """ + This test shows that: + - if the entry is a normal string the result is always False (not a missing entry), + - if the entry is "" the result is False if the attribute has the "isNA" rule + - if the entry is "" the result is True if the attribute does not have the "isNA" rule + """ + assert va_obj.get_no_entry(input_entry, node_name) is expected + + ##################### + # get_entry_has_value + ##################### + + @pytest.mark.parametrize( + "input_entry, node_name, expected", + [ + ("entry", "Check NA", True), + ("entry", "Check Date", True), + ("", "Check NA", True), + ("", "Check Date", False), + ], + ) + def test_get_entry_has_value( + self, + va_obj: ValidateAttribute, + input_entry: str, + node_name: str, + expected: bool, + ) -> None: + """ + This test shows that: + - if the entry is a normal string the result is always True, + - if the entry is "" the result is True if the attribute has the "isNA" rule + - if the entry is "" the result is False if the attribute does not have the "isNA" rule + """ + assert va_obj.get_entry_has_value(input_entry, node_name) is expected + + 
################# + # list_validation + ################# + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["x,x,x"], name="Check List"), "list like"), + (Series(["x,x,x"], name="Check List"), "list strict"), + (Series([], name="Check List"), "list like"), + (Series([], name="Check List"), "list strict"), + (Series(["x"], name="Check List"), "list like"), + (Series(["xxx"], name="Check List"), "list like"), + (Series(["1"], name="Check List"), "list like"), + (Series([1], name="Check List"), "list like"), + (Series([1.1], name="Check List"), "list like"), + (Series([1, 1, 1], name="Check List"), "list like"), + (Series([np.nan], name="Check List"), "list like"), + (Series([True], name="Check List"), "list like"), + ], + ) + def test_list_validation_passing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.list_validation + This test shows that: + - when using list like, just about anything is validated + - when using list strict, empty columns, and comma separated strings pass + + """ + errors, warnings, _ = va_obj.list_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column", + [ + (Series(["x"], name="Check List")), + (Series(["xxxx"], name="Check List")), + (Series([1], name="Check List")), + (Series([1.1], name="Check List")), + (Series([1, 1, 1], name="Check List")), + (Series([np.nan], name="Check List")), + (Series([True], name="Check List")), + ], + ) + @pytest.mark.parametrize("rule", ["list strict", "list strict warning"]) + def test_list_validation_not_passing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.list_validation + This test shows what doesn't pass when using list strict + """ + errors, warnings, _ = va_obj.list_validation(rule, input_column) + if rule.endswith("warning"): + assert len(errors) == 0 + assert 
len(warnings) > 0 + else: + assert len(errors) > 0 + assert len(warnings) == 0 + + ################## + # regex_validation + ################## + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["a"], name="Check List"), "regex match [a-f]"), + (Series(["a,b,"], name="Check Regex List Strict"), "regex match [a-f]"), + ], + ) + def test_regex_validation_passing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows passing examples using the match rule + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["g"], name="Check List"), "regex match [a-f]"), + (Series(["a,b,c,g"], name="Check Regex List Strict"), "regex match [a-f]"), + ], + ) + def test_regex_validation_failing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows failing examples using the match rule + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 1 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule, exception", + [ + (Series(["a"]), "", ValidationError), + (Series(["a"]), "regex", ValidationError), + (Series(["a"]), "regex match", ValidationError), + (Series(["a"]), "regex match [a-f]", ValueError), + ], + ) + def test_regex_validation_exceptions( + self, va_obj: ValidateAttribute, input_column: Series, rule: str, exception + ) -> None: + """ + This tests ValidateAttribute.regex_validation + This test shows that: + - when the rule is malformed, a ValidationError is raised + - when the input series has no name, a ValueError is raised + + """ + with pytest.raises(exception): + va_obj.regex_validation(rule, input_column) + + @pytest.mark.parametrize( + "input_column, rule", + [ + 
(Series(["a,b,c"], name="Check Regex List"), "list::regex match [a-f]"), + ( + Series(["a,b,c", "d,e,f"], name="Check Regex List"), + "list::regex match [a-f]", + ), + ], + ) + def test_regex_validation_with_list_column( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.regex_validation using a list column + """ + errors, warnings = va_obj.regex_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + ################# + # type_validation + ################# + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series(["a"], name="Check String"), "str"), + (Series([1], name="Check Num"), "num"), + (Series([1], name="Check Int"), "int"), + (Series([1.1], name="Check Float"), "float"), + (Series([np.nan], name="Check String"), "str"), + (Series([np.nan], name="Check Num"), "num"), + (Series([np.nan], name="Check Int"), "int"), + (Series([np.nan], name="Check Float"), "float"), + ], + ) + def test_type_validation_passing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows passing examples using the type rule + """ + errors, warnings = va_obj.type_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series([1], name="Check String"), "str"), + (Series(["a"], name="Check Num"), "num"), + (Series(["20"], name="Check Num"), "num"), + (Series([1.1], name="Check Int"), "int"), + (Series(["a"], name="Check Int"), "int"), + (Series([1], name="Check Float"), "float"), + (Series(["a"], name="Check Float"), "float"), + ], + ) + def test_type_validation_failing( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows failing examples using the type rule + """ + errors, warnings = 
va_obj.type_validation(rule, input_column) + assert len(errors) == 1 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column, rule", + [ + (Series([1], name="Check String"), "str error"), + (Series([1], name="Check String"), "str warning"), + ], + ) + def test_type_validation_does_not_work( + self, va_obj: ValidateAttribute, input_column: Series, rule: str + ) -> None: + """ + This tests ValidateAttribute.type_validation + This test shows that the msg level parameter doesn't work + """ + errors, warnings = va_obj.type_validation(rule, input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + ################ + # url_validation + ################ + + @pytest.mark.parametrize( + "input_column", + [ + (Series([], name="Check URL")), + (Series([np.nan], name="Check URL")), + ( + Series( + ["https://doi.org/10.1158/0008-5472.can-23-0128"], name="Check URL" + ) + ), + ], + ) + def test_url_validation_passing( + self, + va_obj: ValidateAttribute, + input_column: Series, + ) -> None: + """ + This tests ValidateAttribute.url_validation + This test shows passing examples using the url rule + """ + errors, warnings = va_obj.url_validation("url", input_column) + assert len(errors) == 0 + assert len(warnings) == 0 + + @pytest.mark.parametrize( + "input_column", + [(Series([""], name="Check URL")), (Series(["xxx"], name="Check URL"))], + ) + def test_url_validation_failing( + self, + va_obj: ValidateAttribute, + input_column: Series, + ) -> None: + """ + This tests ValidateAttribute.url_validation + This test shows failing examples using the url rule + """ + errors, warnings = va_obj.url_validation("url", input_column) + assert len(errors) > 0 + assert len(warnings) == 0 + + ####################### + # _parse_validation_log + ####################### + + @pytest.mark.parametrize( + "input_log, expected_invalid_rows, expected_invalid_entities, expected_manifest_ids", + [ + ({}, [], [], []), + ({"syn1": Series(["A"])}, ["2"], ["A"], ["syn1"]), + 
({"syn1": Series(["A"], index=[1])}, ["3"], ["A"], ["syn1"]), + ({"syn1": Series(["A", "B"])}, ["2"], ["A"], ["syn1"]), + ( + {"syn1": Series(["A"]), "syn2": Series(["B"])}, + ["2"], + ["A", "B"], + ["syn1", "syn2"], + ), + ], + ) + def test__parse_validation_log( + self, + va_obj: ValidateAttribute, + input_log: dict[str, Series], + expected_invalid_rows: list[str], + expected_invalid_entities: list[str], + expected_manifest_ids: list[str], + ) -> None: + """ + This test shows that + - an empty log returns empty values + - only the first value in each series is returned as invalid entities + - the index of the invalid entity is returned incremented by 2 + - each manifest entity is returned + + """ + invalid_rows, invalid_entities, manifest_ids = va_obj._parse_validation_log( + input_log + ) + assert invalid_rows == expected_invalid_rows + assert sorted(invalid_entities) == expected_invalid_entities + assert manifest_ids == expected_manifest_ids + + ################################### + # _merge_format_invalid_rows_values + ################################### + + @pytest.mark.parametrize( + "input_series1, input_series2, expected_invalid_rows, expected_invalid_entry", + [ + (Series([], name="x"), Series([], name="x"), [], []), + (Series(["A"], name="x"), Series([], name="x"), ["2"], ["A"]), + (Series([], name="x"), Series(["B"], name="x"), ["2"], ["B"]), + (Series(["A"], name="x"), Series(["B"], name="x"), ["2"], ["A", "B"]), + (Series(["A"], name="x"), Series(["C"], name="x"), ["2"], ["A", "C"]), + ( + Series(["A", "B"], name="x"), + Series(["C"], name="x"), + ["2", "3"], + ["A", "B", "C"], + ), + ], + ) + def test__merge_format_invalid_rows_values( + self, + va_obj: ValidateAttribute, + input_series1: Series, + input_series2: Series, + expected_invalid_rows: list[str], + expected_invalid_entry: list[str], + ) -> None: + """ + This test shows that + - the names of the series must match + - the indices of both series get combined and incremented by 2 + - the values 
of both series are combined + """ + invalid_rows, invalid_entry = va_obj._merge_format_invalid_rows_values( + input_series1, input_series2 + ) + assert invalid_rows == expected_invalid_rows + assert invalid_entry == expected_invalid_entry + + ############################ + # _format_invalid_row_values + ############################ + + @pytest.mark.parametrize( + "input_series, expected_invalid_rows, expected_invalid_entry", + [ + (Series([]), [], []), + (Series(["A"]), ["2"], ["A"]), + (Series(["A", "B"]), ["2", "3"], ["A", "B"]), + ], + ) + def test__format_invalid_row_values( + self, + va_obj: ValidateAttribute, + input_series: Series, + expected_invalid_rows: list[str], + expected_invalid_entry: list[str], + ) -> None: + """ + This test shows that the indices of the input series is incremented by 2 + """ + invalid_rows, invalid_entry = va_obj._format_invalid_row_values(input_series) + assert invalid_rows == expected_invalid_rows + assert invalid_entry == expected_invalid_entry + + ########################################### + # _remove_non_entry_from_invalid_entry_list + ########################################### + + @pytest.mark.parametrize( + "input_entry, input_row_num, input_name, expected_invalid_entry, expected_row_num", + [ + # Cases where entry and row number remain unchanged + ([], [], "", [], []), + (None, None, "", None, None), + (["A"], None, "", ["A"], None), + (None, ["1"], "", None, ["1"]), + (["A"], ["1"], "x", ["A"], ["1"]), + (["A", "B"], ["1"], "x", ["A", "B"], ["1"]), + (["A"], ["1", "2"], "x", ["A"], ["1", "2"]), + # When there are missing values the value and the row number are removed + ([""], ["1"], "x", [], []), + (["", ""], ["1", "2"], "x", [], []), + (["", "A"], ["1", "2"], "x", ["A"], ["2"]), + # When there are more row numbers than values, and there are missing values + # then the row number that corresponds to the missing value is removed + ([""], ["1", "2"], "x", [], ["2"]), + (["", ""], ["1", "2", "3", "4"], "x", [], ["3", 
"4"]), + (["", "A"], ["1", "2", "3", "4"], "x", ["A"], ["2", "3", "4"]), + (["A", ""], ["1", "2", "3", "4"], "x", ["A"], ["1", "3", "4"]), + ], + ) + def test__remove_non_entry_from_invalid_entry_list( + self, + va_obj: ValidateAttribute, + input_entry: list[str], + input_row_num: list[str], + input_name: str, + expected_invalid_entry: list[str], + expected_row_num: list[str], + ) -> None: + """ + Tests for ValidateAttribute._remove_non_entry_from_invalid_entry_list + """ + invalid_entry, row_num = va_obj._remove_non_entry_from_invalid_entry_list( + input_entry, input_row_num, input_name + ) + assert invalid_entry == expected_invalid_entry + assert row_num == expected_row_num + + @pytest.mark.parametrize( + "input_entry, input_row_num, input_name, exception", + [ + # if first two inputs are not empty, an empty name string causes an IndexError + (["A"], ["1"], "", IndexError), + # if there are more invalid entries than row numbers, there is an IndexError + (["", ""], ["1"], "x", IndexError), + ], + ) + def test__remove_non_entry_from_invalid_entry_list_exceptions( + self, + va_obj: ValidateAttribute, + input_entry: list[str], + input_row_num: list[str], + input_name: str, + exception: Exception, + ) -> None: + """ + Tests for ValidateAttribute._remove_non_entry_from_invalid_entry_list that cause + exceptions + """ + with pytest.raises(exception): + va_obj._remove_non_entry_from_invalid_entry_list( + input_entry, input_row_num, input_name + ) + + #################################### + # _check_if_target_manifest_is_empty + #################################### + + @pytest.mark.parametrize( + "input_dataframe, input_bool_list, input_column_dict, output_bool_list", + [ + # Dataframes with only required columns are always considered empty + ( + DataFrame({"component": [], "id": [], "entityid": []}), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + ( + DataFrame({"component": ["xxx"], "id": ["xxx"], "entityid": ["xxx"]}), + [], + 
{"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + # Dataframes with non-required columns whose only values are null are considered empty + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + "col2": [np.nan], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [True], + ), + # Dataframes with non-required columns who have non-null values are not considered empty + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": ["xxx"], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [False], + ), + ( + DataFrame( + { + "component": ["xxx"], + "id": ["xxx"], + "entityid": ["xxx"], + "col1": [np.nan], + "col2": ["xxx"], + } + ), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + [False], + ), + ], + ) + def test__check_if_target_manifest_is_empty( + self, + va_obj: ValidateAttribute, + input_dataframe: DataFrame, + input_bool_list: list[bool], + input_column_dict: dict[str, str], + output_bool_list: list[bool], + ) -> None: + """ + Tests for ValidateAttribute._check_if_target_manifest_is_empty + """ + bool_list = va_obj._check_if_target_manifest_is_empty( + input_dataframe, input_bool_list, input_column_dict + ) + assert bool_list == output_bool_list + + @pytest.mark.parametrize( + "input_dataframe, input_bool_list, input_column_dict, exception", + [ + # column name dict must have keys "component", "id", "entityid" + (DataFrame({"component": [], "id": [], "entityid": []}), [], {}, KeyError), + # dataframe must have columns "component", "id", "entityid" + ( + DataFrame(), + [], + {"component": "component", "id": "id", "entityid": "entityid"}, + KeyError, + ), + ], + ) + 
def test__check_if_target_manifest_is_empty_exceptions( + self, + va_obj: ValidateAttribute, + input_dataframe: DataFrame, + input_bool_list: list[bool], + input_column_dict: dict[str, str], + exception: Exception, + ) -> None: + """ + Tests for ValidateAttribute._check_if_target_manifest_is_empty that cause + exceptions + """ + with pytest.raises(exception): + va_obj._check_if_target_manifest_is_empty( + input_dataframe, input_bool_list, input_column_dict + ) + + ################# + # _get_rule_scope + ################# + + @pytest.mark.parametrize( + "input_rule, output_scope", + [ + # After splitting by spaces, the third element is returned + ("a b c", "c"), + ("a b c d", "c"), + ], + ) + def test__get_rule_scope( + self, va_obj: ValidateAttribute, input_rule: str, output_scope: str + ) -> None: + """ + Tests for ValidateAttribute._get_rule_scope + """ + assert va_obj._get_rule_scope(input_rule) == output_scope + + @pytest.mark.parametrize( + "input_rule, exception", + [ + # The rule must be a string that, when split by spaces, has at least three elements + ("", IndexError), + ("x", IndexError), + ("x x", IndexError), + ("x;x;x", IndexError), + ], + ) + def test__get_rule_scope_exceptions( + self, va_obj: ValidateAttribute, input_rule: str, exception: Exception + ) -> None: + """ + Tests for ValidateAttribute._get_rule_scope that cause exceptions + """ + with pytest.raises(exception): + va_obj._get_rule_scope(input_rule)