[GEN-1021] update library strategy (#580)

* update valid library_strategy values * comment unwanted kwargs out * introduce row specific validation
Sage-Bionetworks · Nov 14, 2024 · a757ad7 · a757ad7
1 parent 42b5ff8
commit a757ad7
Show file tree

Hide file tree

Showing 4 changed files with 384 additions and 38 deletions.
diff --git a/genie/process_functions.py b/genie/process_functions.py
@@ -980,3 +980,118 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
         elif data_type == "boolean":
             dataset[column] = dataset[column].astype(pd.BooleanDtype())
     return dataset[list(schema.keys())]
+
+
+def get_row_indices_for_invalid_column_values(
+    df: pd.DataFrame,
+    col: str,
+    possible_values: list,
+    na_allowed: bool = False,
+    sep: Optional[str] = None,
+) -> pd.Index:
+    """Return the indices of the rows whose value in col is not in possible_values.
+
+    Args:
+        df (pd.DataFrame): Input dataframe
+        col (str): The column to be checked
+        possible_values (list): The list of possible values
+        na_allowed (bool, optional): If True, NA cells are not checked. Defaults to False.
+        sep (Optional[str], optional): Separator for string cells that hold a list of
+            values; every item must be in possible_values. Defaults to None.
+
+    Returns:
+        pd.Index: The row indices of the rows with values that are not in possible_values.
+    """
+    # dropping NAs only removes whole NA cells, not NA items inside a separated list
+    check_values = df[col].dropna() if na_allowed else df[col]
+
+    def is_valid(value) -> bool:
+        # non-string cells (e.g. NaN, numbers) cannot be split: check them as one value
+        if sep and isinstance(value, str):
+            return all(item in possible_values for item in value.split(sep))
+        return value in possible_values
+
+    flags = [not is_valid(value) for value in check_values]
+    invalid = pd.Series(flags, index=check_values.index, dtype=bool)
+    return invalid[invalid].index
+
+
+def get_message_for_invalid_column_value(
+    col: str, filename: str, invalid_indices: pd.Index, possible_values: list
+) -> tuple:
+    """Build the warning and error messages for a column with invalid row values.
+
+    Args:
+        col (str): The column to be checked
+        filename (str): The file name
+        invalid_indices (pd.Index): The row indices of the rows with invalid values
+        possible_values (list): The list of possible values
+
+    Returns:
+        tuple: warning, error (warning is always empty; error is empty if no invalid rows)
+    """
+    warning = ""
+    error = ""
+    if len(invalid_indices) > 0:
+        # pandas casts an integer column with one NA/blank value to double, so whole
+        # numbers render as "1.0". Strip only a trailing ".0": replacing ".0" anywhere
+        # would corrupt values such as "2.05" or "v2.0.1".
+        labels = (str(value) for value in possible_values)
+        valid = ", ".join(s[:-2] if s.endswith(".0") else s for s in labels)
+        error = (
+            f"{filename}: Please double check your {col} column. Valid values are {valid}. "
+            f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
+            f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
+        )
+    return (warning, error)
+
+
+def check_column_and_values_row_specific(
+    df: pd.DataFrame,
+    col: str,
+    possible_values: list,
+    filename: str,
+    na_allowed: bool = False,
+    required: bool = False,
+    sep: Optional[str] = None,
+) -> tuple:
+    """Check that a column exists and that each of its rows holds a valid value.
+
+    A missing column yields an error when it is required and a warning otherwise.
+    An existing column is checked row by row and the invalid rows are reported
+    in the error message.
+
+    Args:
+        df (pd.DataFrame): Input dataframe
+        col (str): The column to be checked
+        possible_values (list): The list of possible values
+        filename (str): The file name
+        na_allowed (bool, optional): If NA is allowed. Defaults to False.
+        required (bool, optional): If the column is required. Defaults to False.
+        sep (Optional[str], optional): The string separator. Defaults to None.
+
+    Returns:
+        tuple: warning, error. A missing optional column gives only a warning; a
+            missing required column gives only an error. Otherwise both come from
+            get_message_for_invalid_column_value.
+    """
+    if not checkColExist(df, col):
+        # column absent: the severity depends on whether the column is required
+        if required:
+            return ("", f"{filename}: Must have {col} column.\n")
+        missing_warning = (
+            f"{filename}: Doesn't have {col} column. "
+            "This column will be added.\n"
+        )
+        return (missing_warning, "")
+
+    # column present: get the row indices of the invalid values
+    invalid_rows = get_row_indices_for_invalid_column_values(
+        df, col, possible_values, na_allowed, sep
+    )
+
+    # generate validation message
+    warning, error = get_message_for_invalid_column_value(
+        col, filename, invalid_rows, possible_values
+    )
+    return (warning, error)
diff --git a/genie_registry/assay.py b/genie_registry/assay.py
@@ -1,12 +1,11 @@
 """Assay information class"""
 
 import os
-import yaml
 
 import pandas as pd
-
-from genie.example_filetype_format import FileTypeFormat
+import yaml
 from genie import extract, load, process_functions
+from genie.example_filetype_format import FileTypeFormat
 
 
 class Assayinfo(FileTypeFormat):
@@ -16,7 +15,7 @@ class Assayinfo(FileTypeFormat):
 
     _process_kwargs = ["newPath", "databaseSynId"]
 
-    _validation_kwargs = ["project_id"]
+    # _validation_kwargs = ["project_id"]
 
     def _validateFilename(self, filepath_list):
         """Validate assay information filename"""
@@ -128,7 +127,7 @@ def _get_dataframe(self, filepath_list):
             all_panel_info = pd.concat([all_panel_info, assay_finaldf])
         return all_panel_info
 
-    def _validate(self, assay_info_df, project_id):
+    def _validate(self, assay_info_df):
         """
         Validates the values of assay information file
 
@@ -202,7 +201,7 @@ def _validate(self, assay_info_df, project_id):
         warn, error = process_functions.check_col_and_values(
             assay_info_df,
             "library_strategy",
-            read_group_headers["library_strategy"]["enum"],
+            ["Targeted Sequencing", "WXS"],
             filename="Assay_information.yaml",
             required=True,
         )
@@ -231,16 +230,6 @@ def _validate(self, assay_info_df, project_id):
         warning += warn
         total_error += error
 
-        # target_capture_kit = read_group_headers['target_capture_kit']['enum']
-        # warn, error = process_functions.check_col_and_values(
-        #     assay_info_df,
-        #     'target_capture_kit',
-        #     target_capture_kit,
-        #     filename="Assay_information.yaml",
-        #     required=True)
-        # warning += warn
-        # total_error += error
-
         if not process_functions.checkColExist(assay_info_df, "target_capture_kit"):
             total_error += (
                 "Assay_information.yaml: " "Must have target_capture_kit column.\n"

diff --git a/tests/test_assay.py b/tests/test_assay.py
@@ -5,9 +5,8 @@
 
 import pandas as pd
 import pytest
-
-from genie_registry.assay import Assayinfo
 from genie import extract, process_functions
+from genie_registry.assay import Assayinfo
 
 GDC_DATA_DICT = {
     "properties": {
@@ -45,7 +44,7 @@ def test_validinput__validate(assay_info):
     assay_info_dict = {
         "SEQ_ASSAY_ID": ["SAGE-1", "SAGE-3"],
         "is_paired_end": [True, False],
-        "library_strategy": ["value1", "value2"],
+        "library_strategy": ["Targeted Sequencing", "WXS"],
         "library_selection": ["value1", "value2"],
         "platform": ["value1", "value2"],
         "instrument_model": ["value1", "value2"],
@@ -68,18 +67,18 @@ def test_validinput__validate(assay_info):
     ), patch.object(
         process_functions, "get_gdc_data_dictionary", return_value=test_dict
     ) as patch_get_gdc:
-        error, warning = assay_info._validate(assay_info_df, "syn9999")
+        error, warning = assay_info._validate(assay_info_df)
         assert error == ""
         assert warning == ""
         patch_get_gdc.assert_called()
 
 
 def test_case__validate(assay_info):
-    """Valid input should have no errors or warnings"""
+    """Valid input with lowercase SEQ_ASSAY_ID, should have no errors or warnings"""
     assay_info_dict = {
         "SEQ_ASSAY_ID": ["sage-1", "SAGE-3"],
         "is_paired_end": [True, False],
-        "library_strategy": ["value1", "value2"],
+        "library_strategy": ["Targeted Sequencing", "WXS"],
         "library_selection": ["value1", "value2"],
         "platform": ["value1", "value2"],
         "instrument_model": ["value1", "value2"],
@@ -102,18 +101,18 @@ def test_case__validate(assay_info):
     ), patch.object(
         process_functions, "get_gdc_data_dictionary", return_value=test_dict
     ) as patch_get_gdc:
-        error, warning = assay_info._validate(assay_info_df, "syn9999")
+        error, warning = assay_info._validate(assay_info_df)
         assert error == ""
         assert warning == ""
         patch_get_gdc.assert_called()
 
 
 def test_underscore__validate(assay_info):
-    """Valid input should have no errors or warnings"""
+    """Valid input with underscore in SEQ_ASSAY_ID, should have no errors or warnings"""
     assay_info_dict = {
         "SEQ_ASSAY_ID": ["SAGE_1", "SAGE-3"],
         "is_paired_end": [True, False],
-        "library_strategy": ["value1", "value2"],
+        "library_strategy": ["Targeted Sequencing", "WXS"],
         "library_selection": ["value1", "value2"],
         "platform": ["value1", "value2"],
         "instrument_model": ["value1", "value2"],
@@ -136,7 +135,7 @@ def test_underscore__validate(assay_info):
     ), patch.object(
         process_functions, "get_gdc_data_dictionary", return_value=test_dict
     ) as patch_get_gdc:
-        error, warning = assay_info._validate(assay_info_df, "syn9999")
+        error, warning = assay_info._validate(assay_info_df)
         assert error == ""
         assert warning == ""
         patch_get_gdc.assert_called()
@@ -149,7 +148,7 @@ def test__missingcols__validate(assay_info):
     with patch.object(
         process_functions, "get_gdc_data_dictionary", return_value=test_dict
     ) as patch_get_gdc:
-        error, warning = assay_info._validate(assay_info_df, "syn99999")
+        error, warning = assay_info._validate(assay_info_df)
     expected_errors = (
         "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"
         "Assay_information.yaml: Must have is_paired_end column.\n"
@@ -230,7 +229,7 @@ def test_invalid__validate(assay_info):
     assay_info_dict = {
         "SEQ_ASSAY_ID": ["SAGE-1", "SAG-2"],
         "is_paired_end": [True, "foo"],
-        "library_strategy": ["foo", "ChIP-Seq"],
+        "library_strategy": ["foo", "WXS"],
         "library_selection": ["foo", "PCR"],
         "platform": ["foo", "Illumina"],
         "instrument_model": ["foo", "Illumina HiSeq 4000"],
@@ -256,7 +255,7 @@ def test_invalid__validate(assay_info):
     ), patch.object(
         process_functions, "get_gdc_data_dictionary", return_value=test_dict
     ) as patch_get_gdc:
-        error, warning = assay_info._validate(assay_info_df, "syn9999")
+        error, warning = assay_info._validate(assay_info_df)
         expected_errors = (
             "Assay_information.yaml: "
             "Please make sure all your SEQ_ASSAY_IDs start with your "
@@ -270,7 +269,7 @@ def test_invalid__validate(assay_info):
             "This column must only be these values: value1, value2\n"
             "Assay_information.yaml: "
             "Please double check your library_strategy column.  "
-            "This column must only be these values: value1, value2\n"
+            "This column must only be these values: Targeted Sequencing, WXS\n"
             "Assay_information.yaml: "
             "Please double check your platform column.  "
             "This column must only be these values: value1, value2\n"