[GEN-846] Ignore case and allow underscores in cross-validate (#536)

* add initial standardize string function for bed, clinical and assay cross validation * add remaining tests for clinical checks against maf and bed with ignore case and allow underscore * add to contributing guidelines for release --------- Co-authored-by: Rixing Xu <[email protected]>
Sage-Bionetworks · Sep 18, 2023 · db442a2 · db442a2
1 parent 7c1e5a5
commit db442a2
Show file tree

Hide file tree

Showing 5 changed files with 306 additions and 15 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -125,12 +125,14 @@ patch.object(MODULE_NAME, "FUNCTION_TO_MOCK_NAME".return_value=SOME_RETURN_VALUE
 Follow gitflow best practices as linked above.
 
 1. Always merge all new features into `develop` branch first (unless it is a documentation, readme, or github action patch into `main`)
-1. After initial features are ready in the `develop` branch, create a `release-X.X` branch to prepare for the release.
+1. After initial features are ready in the `develop` branch, create a `release-X.X` branch (do not need to push this branch to remote) to prepare for the release.
     1. update the `__version__` parameter in `genie/__init__.py`
 1. Merge `release-X.X` branch into `main` - Not by pull request!
-1. Create release tag (`v...`) and include release notes.  Also include any known bugs for each release here.
+1. Create release tag (`v...`) and a brief message
+1. Push tag and change(s) from `main`
+1. Create a new release on the repo. Include release notes.  Also include any known bugs for each release here. Wait for the CI/CD to finish.
 1. Merge `main` back into `develop`
-
+1. Push `develop`
 
 ### DockerHub
 

diff --git a/genie/validate.py b/genie/validate.py
@@ -255,28 +255,48 @@ def _perform_validate(syn, args):
 
 
 def parse_file_info_in_nested_list(
-    nested_list: List[List[synapseclient.Entity]], search_str: str
+    nested_list: List[List[synapseclient.Entity]],
+    search_str: str,
+    ignore_case: bool = False,
+    allow_underscore: bool = False,
 ) -> dict:
-    """Parses for a name and filepath in a nested list of Synapse entity objects
+    """
+    TODO: To refactor/remove once clinical file structure gets updated to
+    be non-nested
+
+    Parses for a name and filepath in a nested list of Synapse entity objects
 
     Args:
         nested_list (list[list]): _description_
         search_str (str): the substring to look for in the files
+        ignore_case (bool, optional): whether to perform case-insensitive comparison.
+        allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes.
 
     Returns:
         dict[dict]: files found,
             name(s) and filepath(s) of the file(s) found
     """
     file_info = {}
-    files = {
+    all_files = {
         file["name"]: file["path"]
         for files in nested_list
         for file in files
-        if file["name"].startswith(search_str)
+        if standardize_string_for_validation(
+            input_string=file["name"],
+            ignore_case=ignore_case,
+            allow_underscore=allow_underscore,
+        ).startswith(
+            standardize_string_for_validation(
+                input_string=search_str,
+                ignore_case=ignore_case,
+                allow_underscore=allow_underscore,
+            )
+        )
     }
-    file_info["name"] = ",".join(files.keys())
-    file_info["path"] = list(files.values())  # type: ignore[assignment]
-    return {"files": files, "file_info": file_info}
+
+    file_info["name"] = ",".join(all_files.keys())
+    file_info["path"] = list(all_files.values())  # type: ignore[assignment]
+    return {"files": all_files, "file_info": file_info}
 
 
 def check_values_between_two_df(
@@ -286,6 +306,8 @@ def check_values_between_two_df(
     df2: pd.DataFrame,
     df2_filename: str,
     df2_id_to_check: str,
+    ignore_case: bool = False,
+    allow_underscore: bool = False,
 ) -> tuple:
     """Check that all the identifier(s) (ids) in one
     file (df1) exists in the other file (df1)
@@ -297,6 +319,8 @@ def check_values_between_two_df(
         df2 (pd.DataFrame): file to cross-validate against
         df2_filename (str): filename of file to cross-validate against
         df2_id_to_check (str): name of column to check values for in df2
+        ignore_case (bool, optional): whether to perform case-insensitive comparison.
+        allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes.
 
     Returns:
         tuple: The errors and warnings as a file from cross-validation.
@@ -309,8 +333,26 @@ def check_values_between_two_df(
     df1.columns = [col.upper() for col in df1.columns]
     df2.columns = [col.upper() for col in df2.columns]
 
+    # standardize string values
+    df1_values = [
+        standardize_string_for_validation(
+            input_string=val,
+            ignore_case=ignore_case,
+            allow_underscore=allow_underscore,
+        )
+        for val in df1[df1_id_to_check]
+    ]
+    df2_values = [
+        standardize_string_for_validation(
+            input_string=val,
+            ignore_case=ignore_case,
+            allow_underscore=allow_underscore,
+        )
+        for val in df2[df2_id_to_check]
+    ]
+
     # check to see if df1 ids are present in df2
-    if not set(df1[df1_id_to_check]) <= set(df2[df2_id_to_check]):
+    if not set(df1_values) <= set(df2_values):
         errors = (
             f"At least one {df1_id_to_check} in your {df1_filename} file "
             f"does not exist as a {df2_id_to_check} in your {df2_filename} file. "
@@ -348,3 +390,28 @@ def check_variant_start_and_end_positions(
             "position discrepancy will show a blank reference and variant allele.\n"
         )
     return errors, warnings
+
+
+def standardize_string_for_validation(
+    input_string: str, ignore_case: bool = False, allow_underscore: bool = False
+) -> str:
+    """This standardizes a string to prep it for further validation purposes
+        e.g: string comparison
+
+    Args:
+        input_string (str): input string to standardize
+        ignore_case (bool, optional): Lowercases the string perform case-insensitive comparison. Defaults to False.
+        allow_underscore (bool, optional): Treats underscores as equivalent to dashes. Defaults to False.
+
+    Returns:
+        str: standardized string
+    """
+    if isinstance(input_string, str):
+        standardized_str = input_string
+        if ignore_case:
+            standardized_str = standardized_str.lower()
+        if allow_underscore:
+            standardized_str = standardized_str.replace("_", "-")
+        return standardized_str
+    else:
+        return input_string
diff --git a/genie_registry/clinical.py b/genie_registry/clinical.py
@@ -1021,7 +1021,10 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple:
 
         for seq_assay_id in seq_assay_ids:
             bed_files = validate.parse_file_info_in_nested_list(
-                nested_list=self.ancillary_files, search_str=f"{seq_assay_id}.bed"  # type: ignore[arg-type]
+                nested_list=self.ancillary_files,
+                search_str=f"{seq_assay_id}.bed",  # type: ignore[arg-type]
+                ignore_case=True,
+                allow_underscore=True,
             )
             if not bed_files["files"]:
                 missing_files.append(f"{seq_assay_id}.bed")
@@ -1074,6 +1077,8 @@ def _cross_validate_assay_info_has_seq(self, clinicaldf: pd.DataFrame) -> tuple:
                         df2=assay_df,
                         df2_filename="assay information",
                         df2_id_to_check=col_to_validate,
+                        ignore_case=True,
+                        allow_underscore=True,
                     )
         return errors, warnings
 

diff --git a/tests/test_clinical.py b/tests/test_clinical.py
@@ -1122,6 +1122,28 @@ def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
     assert warnings == expected_warning
 
 
+def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
+    test_clinical_df = pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-SAGE-1"]})
+    test_ancillary_files = (
+        [
+            [{"name": "SAGE-SAGE-1.bed", "path": ""}],
+        ],
+    )
+    with mock.patch.object(
+        validate,
+        "parse_file_info_in_nested_list",
+        return_value={"files": "SAGE-1-1.bed"},
+    ) as patch_parse_file_info:
+        clin_class.ancillary_files = test_ancillary_files
+        clin_class._cross_validate_bed_files_exist(test_clinical_df)
+        patch_parse_file_info.assert_called_once_with(
+            nested_list=clin_class.ancillary_files,
+            search_str="SAGE-SAGE-1.bed",
+            ignore_case=True,
+            allow_underscore=True,
+        )
+
+
 def test_that__cross_validate_calls_expected_methods(clin_class):
     with mock.patch.object(
         Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "")
@@ -1211,6 +1233,36 @@ def test_that__cross_validate_assay_info_has_seq_does_not_call_check_values_if_i
         patch_check_values.assert_not_called()
 
 
+def test_that__cross_validate_assay_info_has_seq_calls_check_values_between_two_df(
+    clin_class,
+):
+    with mock.patch.object(
+        validate,
+        "parse_file_info_in_nested_list",
+        return_value={"files": {"some_file"}, "file_info": {"name": "", "path": ""}},
+    ) as patch_assay_files, mock.patch.object(
+        process_functions,
+        "get_assay_dataframe",
+        return_value=None,
+    ) as patch_get_df, mock.patch.object(
+        process_functions, "checkColExist", return_value=True
+    ) as patch_check_col_exist, mock.patch.object(
+        validate, "check_values_between_two_df", return_value=("", "")
+    ) as patch_check_values:
+        input_df = None
+        clin_class._cross_validate_assay_info_has_seq(clinicaldf=input_df)
+        patch_check_values.assert_called_once_with(
+            df1=None,
+            df1_filename="clinical file",
+            df1_id_to_check="SEQ_ASSAY_ID",
+            df2=None,
+            df2_filename="assay information",
+            df2_id_to_check="SEQ_ASSAY_ID",
+            ignore_case=True,
+            allow_underscore=True,
+        )
+
+
 @pytest.mark.parametrize(
     "test_assay_df,expected_error,expected_warning",
     [