Skip to content

Commit

Permalink
[GEN-846] Ignore case and allow underscores in cross-validate (#536)
Browse files Browse the repository at this point in the history
* add initial standardize string function for bed, clinical and assay cross validation
* add remaining tests for clinical checks against maf and bed with ignore case and allow underscore
* add to contributing guidelines for release

---------

Co-authored-by: Rixing Xu <[email protected]>
  • Loading branch information
rxu17 and Rixing Xu authored Sep 18, 2023
1 parent 7c1e5a5 commit db442a2
Show file tree
Hide file tree
Showing 5 changed files with 306 additions and 15 deletions.
8 changes: 5 additions & 3 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,14 @@ patch.object(MODULE_NAME, "FUNCTION_TO_MOCK_NAME".return_value=SOME_RETURN_VALUE
Follow gitflow best practices as linked above.

1. Always merge all new features into `develop` branch first (unless it is a documentation, readme, or github action patch into `main`)
1. After initial features are ready in the `develop` branch, create a `release-X.X` branch to prepare for the release.
1. After initial features are ready in the `develop` branch, create a `release-X.X` branch (do not need to push this branch to remote) to prepare for the release.
1. update the `__version__` parameter in `genie/__init__.py`
1. Merge `release-X.X` branch into `main` - Not by pull request!
1. Create release tag (`v...`) and include release notes. Also include any known bugs for each release here.
1. Create release tag (`v...`) and a brief message
1. Push tag and change(s) from `main`
1. Create a new release on the repo. Include release notes. Also include any known bugs for each release here. Wait for the CI/CD to finish.
1. Merge `main` back into `develop`

1. Push `develop`

### DockerHub

Expand Down
83 changes: 75 additions & 8 deletions genie/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,28 +255,48 @@ def _perform_validate(syn, args):


def parse_file_info_in_nested_list(
nested_list: List[List[synapseclient.Entity]], search_str: str
nested_list: List[List[synapseclient.Entity]],
search_str: str,
ignore_case: bool = False,
allow_underscore: bool = False,
) -> dict:
"""Parses for a name and filepath in a nested list of Synapse entity objects
"""
TODO: To refactor/remove once clinical file structure gets updated to
be non-nested
Parses for a name and filepath in a nested list of Synapse entity objects
Args:
nested_list (list[list]): _description_
search_str (str): the substring to look for in the files
ignore_case (bool, optional): whether to perform case-insensitive comparison.
allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes.
Returns:
dict[dict]: files found,
name(s) and filepath(s) of the file(s) found
"""
file_info = {}
files = {
all_files = {
file["name"]: file["path"]
for files in nested_list
for file in files
if file["name"].startswith(search_str)
if standardize_string_for_validation(
input_string=file["name"],
ignore_case=ignore_case,
allow_underscore=allow_underscore,
).startswith(
standardize_string_for_validation(
input_string=search_str,
ignore_case=ignore_case,
allow_underscore=allow_underscore,
)
)
}
file_info["name"] = ",".join(files.keys())
file_info["path"] = list(files.values()) # type: ignore[assignment]
return {"files": files, "file_info": file_info}

file_info["name"] = ",".join(all_files.keys())
file_info["path"] = list(all_files.values()) # type: ignore[assignment]
return {"files": all_files, "file_info": file_info}


def check_values_between_two_df(
Expand All @@ -286,6 +306,8 @@ def check_values_between_two_df(
df2: pd.DataFrame,
df2_filename: str,
df2_id_to_check: str,
ignore_case: bool = False,
allow_underscore: bool = False,
) -> tuple:
"""Check that all the identifier(s) (ids) in one
file (df1) exists in the other file (df1)
Expand All @@ -297,6 +319,8 @@ def check_values_between_two_df(
df2 (pd.DataFrame): file to cross-validate against
df2_filename (str): filename of file to cross-validate against
df2_id_to_check (str): name of column to check values for in df2
ignore_case (bool, optional): whether to perform case-insensitive comparison.
allow_underscore (bool, optional): whether to treat underscores as equivalent to dashes.
Returns:
tuple: The errors and warnings as a file from cross-validation.
Expand All @@ -309,8 +333,26 @@ def check_values_between_two_df(
df1.columns = [col.upper() for col in df1.columns]
df2.columns = [col.upper() for col in df2.columns]

# standardize string values
df1_values = [
standardize_string_for_validation(
input_string=val,
ignore_case=ignore_case,
allow_underscore=allow_underscore,
)
for val in df1[df1_id_to_check]
]
df2_values = [
standardize_string_for_validation(
input_string=val,
ignore_case=ignore_case,
allow_underscore=allow_underscore,
)
for val in df2[df2_id_to_check]
]

# check to see if df1 ids are present in df2
if not set(df1[df1_id_to_check]) <= set(df2[df2_id_to_check]):
if not set(df1_values) <= set(df2_values):
errors = (
f"At least one {df1_id_to_check} in your {df1_filename} file "
f"does not exist as a {df2_id_to_check} in your {df2_filename} file. "
Expand Down Expand Up @@ -348,3 +390,28 @@ def check_variant_start_and_end_positions(
"position discrepancy will show a blank reference and variant allele.\n"
)
return errors, warnings


def standardize_string_for_validation(
input_string: str, ignore_case: bool = False, allow_underscore: bool = False
) -> str:
"""This standardizes a string to prep it for further validation purposes
e.g: string comparison
Args:
input_string (str): input string to standardize
ignore_case (bool, optional): Lowercases the string perform case-insensitive comparison. Defaults to False.
allow_underscore (bool, optional): Treats underscores as equivalent to dashes. Defaults to False.
Returns:
str: standardized string
"""
if isinstance(input_string, str):
standardized_str = input_string
if ignore_case:
standardized_str = standardized_str.lower()
if allow_underscore:
standardized_str = standardized_str.replace("_", "-")
return standardized_str
else:
return input_string
7 changes: 6 additions & 1 deletion genie_registry/clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1021,7 +1021,10 @@ def _cross_validate_bed_files_exist(self, clinicaldf) -> tuple:

for seq_assay_id in seq_assay_ids:
bed_files = validate.parse_file_info_in_nested_list(
nested_list=self.ancillary_files, search_str=f"{seq_assay_id}.bed" # type: ignore[arg-type]
nested_list=self.ancillary_files,
search_str=f"{seq_assay_id}.bed", # type: ignore[arg-type]
ignore_case=True,
allow_underscore=True,
)
if not bed_files["files"]:
missing_files.append(f"{seq_assay_id}.bed")
Expand Down Expand Up @@ -1074,6 +1077,8 @@ def _cross_validate_assay_info_has_seq(self, clinicaldf: pd.DataFrame) -> tuple:
df2=assay_df,
df2_filename="assay information",
df2_id_to_check=col_to_validate,
ignore_case=True,
allow_underscore=True,
)
return errors, warnings

Expand Down
52 changes: 52 additions & 0 deletions tests/test_clinical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,28 @@ def test_that_cross_validate_bed_files_exist_returns_correct_msgs(
assert warnings == expected_warning


def test_that_cross_validate_bed_files_exist_calls_expected_methods(clin_class):
test_clinical_df = pd.DataFrame({"SEQ_ASSAY_ID": ["SAGE-SAGE-1"]})
test_ancillary_files = (
[
[{"name": "SAGE-SAGE-1.bed", "path": ""}],
],
)
with mock.patch.object(
validate,
"parse_file_info_in_nested_list",
return_value={"files": "SAGE-1-1.bed"},
) as patch_parse_file_info:
clin_class.ancillary_files = test_ancillary_files
clin_class._cross_validate_bed_files_exist(test_clinical_df)
patch_parse_file_info.assert_called_once_with(
nested_list=clin_class.ancillary_files,
search_str="SAGE-SAGE-1.bed",
ignore_case=True,
allow_underscore=True,
)


def test_that__cross_validate_calls_expected_methods(clin_class):
with mock.patch.object(
Clinical, "_cross_validate_assay_info_has_seq", return_value=("", "")
Expand Down Expand Up @@ -1211,6 +1233,36 @@ def test_that__cross_validate_assay_info_has_seq_does_not_call_check_values_if_i
patch_check_values.assert_not_called()


def test_that__cross_validate_assay_info_has_seq_calls_check_values_between_two_df(
clin_class,
):
with mock.patch.object(
validate,
"parse_file_info_in_nested_list",
return_value={"files": {"some_file"}, "file_info": {"name": "", "path": ""}},
) as patch_assay_files, mock.patch.object(
process_functions,
"get_assay_dataframe",
return_value=None,
) as patch_get_df, mock.patch.object(
process_functions, "checkColExist", return_value=True
) as patch_check_col_exist, mock.patch.object(
validate, "check_values_between_two_df", return_value=("", "")
) as patch_check_values:
input_df = None
clin_class._cross_validate_assay_info_has_seq(clinicaldf=input_df)
patch_check_values.assert_called_once_with(
df1=None,
df1_filename="clinical file",
df1_id_to_check="SEQ_ASSAY_ID",
df2=None,
df2_filename="assay information",
df2_id_to_check="SEQ_ASSAY_ID",
ignore_case=True,
allow_underscore=True,
)


@pytest.mark.parametrize(
"test_assay_df,expected_error,expected_warning",
[
Expand Down
Loading

0 comments on commit db442a2

Please sign in to comment.