From db442a28d1cfcbe0612deb734bb4bf2a519b0bb3 Mon Sep 17 00:00:00 2001 From: rxu17 <26471741+rxu17@users.noreply.github.com> Date: Mon, 18 Sep 2023 14:43:37 -0700 Subject: [PATCH] [GEN-846] Ignore case and allow underscores in cross-validate (#536) * add initial standardize string function for bed, clinical and assay cross validation * add remaining tests for clinical checks against maf and bed with ignore case and allow underscore * add to contributing guidelines for release --------- Co-authored-by: Rixing Xu --- CONTRIBUTING.md | 8 +- genie/validate.py | 83 ++++++++++++++++-- genie_registry/clinical.py | 7 +- tests/test_clinical.py | 52 +++++++++++ tests/test_validate.py | 171 ++++++++++++++++++++++++++++++++++++- 5 files changed, 306 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 86ebd66f..96475f19 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -125,12 +125,14 @@ patch.object(MODULE_NAME, "FUNCTION_TO_MOCK_NAME".return_value=SOME_RETURN_VALUE Follow gitflow best practices as linked above. 1. Always merge all new features into `develop` branch first (unless it is a documentation, readme, or github action patch into `main`) -1. After initial features are ready in the `develop` branch, create a `release-X.X` branch to prepare for the release. +1. After initial features are ready in the `develop` branch, create a `release-X.X` branch (do not need to push this branch to remote) to prepare for the release. 1. update the `__version__` parameter in `genie/__init__.py` 1. Merge `release-X.X` branch into `main` - Not by pull request! -1. Create release tag (`v...`) and include release notes. Also include any known bugs for each release here. +1. Create release tag (`v...`) and a brief message +1. Push tag and change(s) from `main` +1. Create a new release on the repo. Include release notes. Also include any known bugs for each release here. Wait for the CI/CD to finish. 1. Merge `main` back into `develop` - +1. Push `develop` ### DockerHub diff --git a/genie/validate.py b/genie/validate.py index 1b87102e..10d8f586 100644 --- a/genie/validate.py +++ b/genie/validate.py @@ -255,28 +255,48 @@ def _perform_validate(syn, args): def parse_file_info_in_nested_list( - nested_list: List[List[synapseclient.Entity]], search_str: str + nested_list: List[List[synapseclient.Entity]], + search_str: str, + ignore_case: bool = False, + allow_underscore: bool = False, ) -> dict: - """Parses for a name and filepath in a nested list of Synapse entity objects + """ + TODO: To refactor/remove once clinical file structure gets updated to + be non-nested + + Parses for a name and filepath in a nested list of Synapse entity objects Args: nested_list (list[list]): _description_ search_str (str): the substring to look for in the files + ignore_case (bool, optional): whether to perform case-insensitive comparison. + allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes. Returns: dict[dict]: files found, name(s) and filepath(s) of the file(s) found """ file_info = {} - files = { + all_files = { file["name"]: file["path"] for files in nested_list for file in files - if file["name"].startswith(search_str) + if standardize_string_for_validation( + input_string=file["name"], + ignore_case=ignore_case, + allow_underscore=allow_underscore, + ).startswith( + standardize_string_for_validation( + input_string=search_str, + ignore_case=ignore_case, + allow_underscore=allow_underscore, + ) + ) } - file_info["name"] = ",".join(files.keys()) - file_info["path"] = list(files.values()) # type: ignore[assignment] - return {"files": files, "file_info": file_info} + + file_info["name"] = ",".join(all_files.keys()) + file_info["path"] = list(all_files.values()) # type: ignore[assignment] + return {"files": all_files, "file_info": file_info} def check_values_between_two_df( @@ -286,6 +306,8 @@ def check_values_between_two_df( df2: pd.DataFrame, df2_filename: str, df2_id_to_check: str, + ignore_case: bool = False, + allow_underscore: bool = False, ) -> tuple: """Check that all the identifier(s) (ids) in one file (df1) exists in the other file (df1) @@ -297,6 +319,8 @@ def check_values_between_two_df( df2 (pd.DataFrame): file to cross-validate against df2_filename (str): filename of file to cross-validate against df2_id_to_check (str): name of column to check values for in df2 + ignore_case (bool, optional): whether to perform case-insensitive comparison. + allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes. Returns: tuple: The errors and warnings as a file from cross-validation. @@ -309,8 +333,26 @@ def check_values_between_two_df( df1.columns = [col.upper() for col in df1.columns] df2.columns = [col.upper() for col in df2.columns] + # standardize string values + df1_values = [ + standardize_string_for_validation( + input_string=val, + ignore_case=ignore_case, + allow_underscore=allow_underscore, + ) + for val in df1[df1_id_to_check] + ] + df2_values = [ + standardize_string_for_validation( + input_string=val, + ignore_case=ignore_case, + allow_underscore=allow_underscore, + ) + for val in df2[df2_id_to_check] + ] + # check to see if df1 ids are present in df2 - if not set(df1[df1_id_to_check]) <= set(df2[df2_id_to_check]): + if not set(df1_values) <= set(df2_values): errors = ( f"At least one {df1_id_to_check} in your {df1_filename} file " f"does not exist as a {df2_id_to_check} in your {df2_filename} file. " @@ -348,3 +390,28 @@ def check_variant_start_and_end_positions( "position discrepancy will show a blank reference and variant allele.\n" ) return errors, warnings + + +def standardize_string_for_validation( + input_string: str, ignore_case: bool = False, allow_underscore: bool = False +) -> str: + """This standardizes a string to prep it for further validation purposes + e.g: string comparison + + Args: + input_string (str): input string to standardize + ignore_case (bool, optional): Lowercases the string perform case-insensitive comparison. Defaults to False. + allow_underscore (bool, optional): Treats underscores as equivalent to dashes. Defaults to False. + + Returns: + str: standardized string + """ + if isinstance(input_string, str): + standardized_str = input_string + if ignore_case: + standardized_str = standardized_str.lower() + if allow_underscore: + standardized_str = standardized_str.replace("_", "-") + return standardized_str + else: + return input_string diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py index f4b8c964..014b0b55 100644 --- a/genie_registry/clinical.py +++ b/genie_registry/clinical.py @@ -1021,7 +1021,10 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple: for seq_assay_id in seq_assay_ids: bed_files = validate.parse_file_info_in_nested_list( - nested_list=self.ancillary_files, search_str=f"{seq_assay_id}.bed" # type: ignore[arg-type] + nested_list=self.ancillary_files, + search_str=f"{seq_assay_id}.bed", # type: ignore[arg-type] + ignore_case=True, + allow_underscore=True, ) if not bed_files["files"]: missing_files.append(f"{seq_assay_id}.bed") @@ -1074,6 +1077,8 @@ def _cross_validate_assay_info_has_seq(self, clinicaldf: pd.DataFrame) -> tuple: df2=assay_df, df2_filename="assay information", df2_id_to_check=col_to_validate, + ignore_case=True, + allow_underscore=True, ) return errors, warnings diff --git a/tests/test_clinical.py b/tests/test_clinical.py index 9fa555dd..92c4ec30 100644 --- a/tests/test_clinical.py +++ b/tests/test_clinical.py @@ -1122,6 +1122,28 @@ def test_that_cross_validate_bed_files_exist_returns_correct_msgs( assert warnings == expected_warning +def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class): + test_clinical_df = pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-SAGE-1"]}) + test_ancillary_files = ( + [ + [{"name": "SAGE-SAGE-1.bed", "path": ""}], + ], + ) + with mock.patch.object( + validate, + "parse_file_info_in_nested_list", + return_value={"files": "SAGE-1-1.bed"}, + ) as patch_parse_file_info: + clin_class.ancillary_files = test_ancillary_files + clin_class._cross_validate_bed_files_exist(test_clinical_df) + patch_parse_file_info.assert_called_once_with( + nested_list=clin_class.ancillary_files, + search_str="SAGE-SAGE-1.bed", + ignore_case=True, + allow_underscore=True, + ) + + def test_that__cross_validate_calls_expected_methods(clin_class): with mock.patch.object( Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "") @@ -1211,6 +1233,36 @@ def test_that__cross_validate_assay_info_has_seq_does_not_call_check_values_if_i patch_check_values.assert_not_called() +def test_that__cross_validate_assay_info_has_seq_calls_check_values_between_two_df( + clin_class, +): + with mock.patch.object( + validate, + "parse_file_info_in_nested_list", + return_value={"files": {"some_file"}, "file_info": {"name": "", "path": ""}}, + ) as patch_assay_files, mock.patch.object( + process_functions, + "get_assay_dataframe", + return_value=None, + ) as patch_get_df, mock.patch.object( + process_functions, "checkColExist", return_value=True + ) as patch_check_col_exist, mock.patch.object( + validate, "check_values_between_two_df", return_value=("", "") + ) as patch_check_values: + input_df = None + clin_class._cross_validate_assay_info_has_seq(clinicaldf=input_df) + patch_check_values.assert_called_once_with( + df1=None, + df1_filename="clinical file", + df1_id_to_check="SEQ_ASSAY_ID", + df2=None, + df2_filename="assay information", + df2_id_to_check="SEQ_ASSAY_ID", + ignore_case=True, + allow_underscore=True, + ) + + @pytest.mark.parametrize( "test_assay_df,expected_error,expected_warning", [ diff --git a/tests/test_validate.py b/tests/test_validate.py index f1eb84b7..3589166b 100644 --- a/tests/test_validate.py +++ b/tests/test_validate.py @@ -507,14 +507,91 @@ def test_that_parse_file_info_in_nested_list_returns_expected( assert result == expected +def test_that_parse_file_info_in_nested_list_returns_expected_with_ignore_case(): + test_nested_list = [ + [ + {"name": "TEST-file1-1", "path": "/some_path1_1.txt"}, + {"name": "test-file1-2", "path": "/some_path1_2.txt"}, + ], + [{"name": "test_file2", "path": "/some_path2.txt"}], + ] + expected = { + "files": { + "TEST-file1-1": "/some_path1_1.txt", + "test-file1-2": "/some_path1_2.txt", + }, + "file_info": { + "name": "TEST-file1-1,test-file1-2", + "path": ["/some_path1_1.txt", "/some_path1_2.txt"], + }, + } + result = validate.parse_file_info_in_nested_list( + nested_list=test_nested_list, search_str="tEsT-file1", ignore_case=True + ) + assert result == expected + + +def test_that_parse_file_info_in_nested_list_returns_expected_with_allow_underscore(): + test_nested_list = [ + [ + {"name": "test_file1_1", "path": "/some_path1_1.txt"}, + {"name": "test-file1_2", "path": "/some_path1_2.txt"}, + ], + [{"name": "test_file2", "path": "/some_path2.txt"}], + ] + expected = { + "files": { + "test_file1_1": "/some_path1_1.txt", + "test-file1_2": "/some_path1_2.txt", + }, + "file_info": { + "name": "test_file1_1,test-file1_2", + "path": ["/some_path1_1.txt", "/some_path1_2.txt"], + }, + } + result = validate.parse_file_info_in_nested_list( + nested_list=test_nested_list, search_str="test_file1", allow_underscore=True + ) + assert result == expected + + +def test_that_parse_file_info_in_nested_list_returns_expected_with_ignore_case_and_allow_underscore(): + test_nested_list = [ + [ + {"name": "TEST-file1_1", "path": "/some_path1_1.txt"}, + {"name": "test_fiLE1_2", "path": "/some_path1_2.txt"}, + ], + [{"name": "test_file2", "path": "/some_path2.txt"}], + ] + expected = { + "files": { + "TEST-file1_1": "/some_path1_1.txt", + "test_fiLE1_2": "/some_path1_2.txt", + }, + "file_info": { + "name": "TEST-file1_1,test_fiLE1_2", + "path": ["/some_path1_1.txt", "/some_path1_2.txt"], + }, + } + result = validate.parse_file_info_in_nested_list( + nested_list=test_nested_list, + search_str="TEST_file1", + ignore_case=True, + allow_underscore=True, + ) + assert result == expected + + @pytest.mark.parametrize( - "test_df1,test_df2,expected_errors,expected_warnings", + "test_df1,test_df2,expected_errors,expected_warnings,ignore_case,allow_underscore", [ ( pd.DataFrame({"ID": [1, 2]}), pd.DataFrame({"ID2": [1, 2, 3]}), "", "", + False, + False, ), ( pd.DataFrame({"ID": [1, 2, 3, 4]}), @@ -522,6 +599,8 @@ def test_that_parse_file_info_in_nested_list_returns_expected( "At least one ID in your test1 file does not exist as a ID2 in your test2 file. " "Please update your file(s) to be consistent.\n", "", + False, + False, ), ( pd.DataFrame({"ID": [3, 4, 5]}), @@ -529,12 +608,70 @@ def test_that_parse_file_info_in_nested_list_returns_expected( "At least one ID in your test1 file does not exist as a ID2 in your test2 file. " "Please update your file(s) to be consistent.\n", "", + False, + False, + ), + ( + pd.DataFrame({"ID": ["TEST1", "TEST2", "TeSt3"]}), + pd.DataFrame({"ID2": ["test1", "test2", "TEST3"]}), + "", + "", + True, + False, + ), + ( + pd.DataFrame({"ID": ["TEST_1", "TEST_2", "TEST-3"]}), + pd.DataFrame({"ID2": ["TEST_1", "TEST-2", "TEST_3"]}), + "", + "", + False, + True, + ), + ( + pd.DataFrame({"ID": ["TEST_1", "TEST_2", "TeSt-3"]}), + pd.DataFrame({"ID2": ["test_1", "test-2", "TEST_3"]}), + "", + "", + True, + True, + ), + ( + pd.DataFrame({"ID": ["TEST__1", "TEST_2", "TeSt-3"]}), + pd.DataFrame({"ID2": ["test_1", "test-2", "TEST_3"]}), + "At least one ID in your test1 file does not exist as a ID2 in your test2 file. " + "Please update your file(s) to be consistent.\n", + "", + True, + True, + ), + ( + pd.DataFrame({"ID": ["TEST1", "TEST2", "TeSt3"]}), + pd.DataFrame({"ID2": ["test1", "test2", "TEST3"]}), + "At least one ID in your test1 file does not exist as a ID2 in your test2 file. " + "Please update your file(s) to be consistent.\n", + "", + False, + False, ), ], - ids=["all_match", "some_match", "no_match"], + ids=[ + "all_match", + "some_match", + "no_match", + "ignore_case", + "allow_underscore", + "ignore_case_and_allow_underscore", + "str_some_match", + "str_no_match", + ], ) def test_that_check_values_between_two_df_returns_expected( - test_df1, test_df2, expected_errors, expected_warnings + test_df1, + test_df2, + expected_errors, + expected_warnings, + ignore_case, + allow_underscore, ): errors, warnings = validate.check_values_between_two_df( df1=test_df1, @@ -543,6 +680,8 @@ def test_that_check_values_between_two_df_returns_expected( df2=test_df2, df2_filename="test2", df2_id_to_check="ID2", + ignore_case=ignore_case, + allow_underscore=allow_underscore, ) assert errors == expected_errors assert warnings == expected_warnings @@ -603,3 +742,29 @@ def test_that_check_variant_start_and_end_positions_returns_expected( ) assert errors == expected_errors assert warnings == expected_warnings + + +@pytest.mark.parametrize( + "input_str,expected,ignore_case,allow_underscore", + [ + ("SAGe-1", "sage-1", True, False), + ("SAGe_1", "SAGe-1", False, True), + ("SAGe_1", "sage-1", True, True), + (120, 120, True, True), + ], + ids=[ + "ignore_case", + "allow_underscore", + "ignore_case_and_allow_underscore", + "non_str", + ], +) +def test_that_standardize_string_for_validation_returns_expected( + input_str, expected, ignore_case, allow_underscore +): + test_str = validate.standardize_string_for_validation( + input_string=input_str, + ignore_case=ignore_case, + allow_underscore=allow_underscore, + ) + assert test_str == expected