From a757ad76e7572124dd2bf94b1e14be00a3c6869a Mon Sep 17 00:00:00 2001 From: Dan Lu <90745557+danlu1@users.noreply.github.com> Date: Thu, 14 Nov 2024 15:54:34 -0800 Subject: [PATCH] [GEN-1021] update library strategy (#580) * update valid library_strategy values * comment unwanted kwargs out * introduce row specific validation --- genie/process_functions.py | 115 ++++++++++++++ genie_registry/assay.py | 21 +-- tests/test_assay.py | 27 ++-- tests/test_process_functions.py | 259 +++++++++++++++++++++++++++++++- 4 files changed, 384 insertions(+), 38 deletions(-) diff --git a/genie/process_functions.py b/genie/process_functions.py index adea2f4f..d00787c7 100644 --- a/genie/process_functions.py +++ b/genie/process_functions.py @@ -980,3 +980,118 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series: elif data_type == "boolean": dataset[column] = dataset[column].astype(pd.BooleanDtype()) return dataset[list(schema.keys())] + + +def get_row_indices_for_invalid_column_values( + df: pd.DataFrame, + col: str, + possible_values: list, + na_allowed: bool = False, + sep: Optional[str] = None, +) -> pd.Index: + """This function checks the column values against possible_values and returns row indices of invalid rows. + + Args: + df (pd.DataFrame): Input dataframe + col (str): The column to be checked + possible_values (list): The list of possible values + na_allowed (bool, optional): If NA is allowed. Defaults to False. + sep (Optional[str], optional): The string separator. Defaults to None. + + Returns: + pd.Index: The row indices of the rows with values that are not in possible_values. + """ + if na_allowed: + # this is only useful for dropping NAs for individual values rather than value_list + check_values = df[col].dropna() + else: + check_values = df[col] + if sep: + # for columns contain lists of values + check_values = check_values.apply( + lambda x: all(substring in possible_values for substring in x.split(sep)) + ) + else: + check_values = check_values.apply(lambda x: x in possible_values) + return check_values[check_values == False].index + + +def get_message_for_invalid_column_value( + col: str, filename: str, invalid_indices: pd.Index, possible_values: list +) -> tuple: + """This function returns the error and warning messages if the target column has rows with invalid values. + + Args: + col (str): The column to be checked + filename (str): The file name + invalid_indices (pd.Index): The row indices of the rows with invalid values + possible_values (list): The list of possible values + + Returns: + tuple: warning, error + """ + warning = "" + error = "" + # check the validity of values in the column + # concatenated possible values. This is done because of pandas typing. An integer column with one NA/blank value will be cast as a double. + possible_values = ", ".join( + [str(value).replace(".0", "") for value in possible_values] + ) + if len(invalid_indices) > 0: + error = ( + f"{filename}: Please double check your {col} column. Valid values are {possible_values}. " + f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. " + f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n" + ) + return (warning, error) + + +def check_column_and_values_row_specific( + df: pd.DataFrame, + col: str, + possible_values: list, + filename: str, + na_allowed: bool = False, + required: bool = False, + sep: Optional[str] = None, +) -> tuple: + """This function checks if the column exists and checks if the values in the column have the valid values. + Currently, this function is only used in assay.py + + Args: + df (pd.DataFrame): Input dataframe + col (str): The column to be checked + possible_values (list): The list of possible values + filename (str): The file name + na_allowed (bool, optional): If NA is allowed. Defaults to False. + required (bool, optional): If the column is required. Defaults to False. + sep (Optional[str], optional): The string separator. Defaults to None. + + Returns: + tuple: warning, error + """ + warning = "" + error = "" + # check the existence of the column + have_column = checkColExist(df, col) + if not have_column: + if required: + error = "{filename}: Must have {col} column.\n".format( + filename=filename, col=col + ) + else: + warning = ( + "{filename}: Doesn't have {col} column. " + "This column will be added.\n".format(filename=filename, col=col) + ) + else: + # get the row indices + invalid_indices = get_row_indices_for_invalid_column_values( + df, col, possible_values, na_allowed, sep + ) + # generate validation message + warning, error = get_message_for_invalid_column_value( + col, filename, invalid_indices, possible_values + ) + + return (warning, error) diff --git a/genie_registry/assay.py b/genie_registry/assay.py index 33362ddc..fcca0452 100644 --- a/genie_registry/assay.py +++ b/genie_registry/assay.py @@ -1,12 +1,11 @@ """Assay information class""" import os -import yaml import pandas as pd - -from genie.example_filetype_format import FileTypeFormat +import yaml from genie import extract, load, process_functions +from genie.example_filetype_format import FileTypeFormat class Assayinfo(FileTypeFormat): @@ -16,7 +15,7 @@ class Assayinfo(FileTypeFormat): _process_kwargs = ["newPath", "databaseSynId"] - _validation_kwargs = ["project_id"] + # _validation_kwargs = ["project_id"] def _validateFilename(self, filepath_list): """Validate assay information filename""" @@ -128,7 +127,7 @@ def _get_dataframe(self, filepath_list): all_panel_info = pd.concat([all_panel_info, assay_finaldf]) return all_panel_info - def _validate(self, assay_info_df, project_id): + def _validate(self, assay_info_df): """ Validates the values of assay information file @@ -202,7 +201,7 @@ def _validate(self, assay_info_df, project_id): warn, error = process_functions.check_col_and_values( assay_info_df, "library_strategy", - read_group_headers["library_strategy"]["enum"], + ["Targeted Sequencing", "WXS"], filename="Assay_information.yaml", required=True, ) @@ -231,16 +230,6 @@ def _validate(self, assay_info_df, project_id): warning += warn total_error += error - # target_capture_kit = read_group_headers['target_capture_kit']['enum'] - # warn, error = process_functions.check_col_and_values( - # assay_info_df, - # 'target_capture_kit', - # target_capture_kit, - # filename="Assay_information.yaml", - # required=True) - # warning += warn - # total_error += error - if not process_functions.checkColExist(assay_info_df, "target_capture_kit"): total_error += ( "Assay_information.yaml: " "Must have target_capture_kit column.\n" diff --git a/tests/test_assay.py b/tests/test_assay.py index 691c693e..dc186621 100644 --- a/tests/test_assay.py +++ b/tests/test_assay.py @@ -5,9 +5,8 @@ import pandas as pd import pytest - -from genie_registry.assay import Assayinfo from genie import extract, process_functions +from genie_registry.assay import Assayinfo GDC_DATA_DICT = { "properties": { @@ -45,7 +44,7 @@ def test_validinput__validate(assay_info): assay_info_dict = { "SEQ_ASSAY_ID": ["SAGE-1", "SAGE-3"], "is_paired_end": [True, False], - "library_strategy": ["value1", "value2"], + "library_strategy": ["Targeted Sequencing", "WXS"], "library_selection": ["value1", "value2"], "platform": ["value1", "value2"], "instrument_model": ["value1", "value2"], @@ -68,18 +67,18 @@ def test_validinput__validate(assay_info): ), patch.object( process_functions, "get_gdc_data_dictionary", return_value=test_dict ) as patch_get_gdc: - error, warning = assay_info._validate(assay_info_df, "syn9999") + error, warning = assay_info._validate(assay_info_df) assert error == "" assert warning == "" patch_get_gdc.assert_called() def test_case__validate(assay_info): - """Valid input should have no errors or warnings""" + """Valid input with lowercase SEQ_ASSAY_ID, should have no errors or warnings""" assay_info_dict = { "SEQ_ASSAY_ID": ["sage-1", "SAGE-3"], "is_paired_end": [True, False], - "library_strategy": ["value1", "value2"], + "library_strategy": ["Targeted Sequencing", "WXS"], "library_selection": ["value1", "value2"], "platform": ["value1", "value2"], "instrument_model": ["value1", "value2"], @@ -102,18 +101,18 @@ def test_case__validate(assay_info): ), patch.object( process_functions, "get_gdc_data_dictionary", return_value=test_dict ) as patch_get_gdc: - error, warning = assay_info._validate(assay_info_df, "syn9999") + error, warning = assay_info._validate(assay_info_df) assert error == "" assert warning == "" patch_get_gdc.assert_called() def test_underscore__validate(assay_info): - """Valid input should have no errors or warnings""" + """Valid input with underscore in SEQ_ASSAY_ID, should have no errors or warnings""" assay_info_dict = { "SEQ_ASSAY_ID": ["SAGE_1", "SAGE-3"], "is_paired_end": [True, False], - "library_strategy": ["value1", "value2"], + "library_strategy": ["Targeted Sequencing", "WXS"], "library_selection": ["value1", "value2"], "platform": ["value1", "value2"], "instrument_model": ["value1", "value2"], @@ -136,7 +135,7 @@ def test_underscore__validate(assay_info): ), patch.object( process_functions, "get_gdc_data_dictionary", return_value=test_dict ) as patch_get_gdc: - error, warning = assay_info._validate(assay_info_df, "syn9999") + error, warning = assay_info._validate(assay_info_df) assert error == "" assert warning == "" patch_get_gdc.assert_called() @@ -149,7 +148,7 @@ def test__missingcols__validate(assay_info): with patch.object( process_functions, "get_gdc_data_dictionary", return_value=test_dict ) as patch_get_gdc: - error, warning = assay_info._validate(assay_info_df, "syn99999") + error, warning = assay_info._validate(assay_info_df) expected_errors = ( "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n" "Assay_information.yaml: Must have is_paired_end column.\n" @@ -230,7 +229,7 @@ def test_invalid__validate(assay_info): assay_info_dict = { "SEQ_ASSAY_ID": ["SAGE-1", "SAG-2"], "is_paired_end": [True, "foo"], - "library_strategy": ["foo", "ChIP-Seq"], + "library_strategy": ["foo", "WXS"], "library_selection": ["foo", "PCR"], "platform": ["foo", "Illumina"], "instrument_model": ["foo", "Illumina HiSeq 4000"], @@ -256,7 +255,7 @@ def test_invalid__validate(assay_info): ), patch.object( process_functions, "get_gdc_data_dictionary", return_value=test_dict ) as patch_get_gdc: - error, warning = assay_info._validate(assay_info_df, "syn9999") + error, warning = assay_info._validate(assay_info_df) expected_errors = ( "Assay_information.yaml: " "Please make sure all your SEQ_ASSAY_IDs start with your " @@ -270,7 +269,7 @@ def test_invalid__validate(assay_info): "This column must only be these values: value1, value2\n" "Assay_information.yaml: " "Please double check your library_strategy column. " - "This column must only be these values: value1, value2\n" + "This column must only be these values: Targeted Sequencing, WXS\n" "Assay_information.yaml: " "Please double check your platform column. " "This column must only be these values: value1, value2\n" diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py index 72f72663..5e918182 100644 --- a/tests/test_process_functions.py +++ b/tests/test_process_functions.py @@ -1,7 +1,10 @@ -from unittest.mock import Mock, patch import uuid +from unittest.mock import Mock, patch import pandas as pd +import pytest +import synapseclient +from genie import process_functions from pandas.api.types import ( is_bool_dtype, is_float_dtype, @@ -9,10 +12,6 @@ is_string_dtype, ) from pandas.testing import assert_frame_equal -import pytest -import synapseclient - -from genie import process_functions DATABASE_DF = pd.DataFrame( { @@ -167,7 +166,7 @@ def test_append__append_rows(): append_rows = process_functions._append_rows(new_datadf, DATABASE_DF, "UNIQUE_KEY") append_rows.fillna("", inplace=True) expecteddf.fillna("", inplace=True) - assert append_rows.equals(expecteddf[append_rows.columns]) + assert_frame_equal(append_rows, expecteddf[append_rows.columns], check_dtype=False) def test___create_update_rowsdf(): @@ -626,7 +625,7 @@ def get_create_missing_columns_test_cases(): "name": "empty_df", "test_input": pd.DataFrame({}), "test_schema": {"col1": "float"}, - "expected_output": pd.DataFrame({"col1": []}, index=[]), + "expected_output": pd.DataFrame({"col1": []}, dtype=float), "expected_dtype": is_float_dtype, "expected_na_count": 0, }, @@ -660,7 +659,8 @@ def test_that_create_missing_columns_gets_expected_output_with_single_col_df( result = process_functions.create_missing_columns( dataset=test_cases["test_input"], schema=test_cases["test_schema"] ) - assert_frame_equal(result, test_cases["expected_output"], check_exact=True) + result.reset_index(drop=True, inplace=True) + assert_frame_equal(result, test_cases["expected_output"], check_dtype=False) assert test_cases["expected_dtype"](result.iloc[:, 0]) assert result.isna().sum().sum() == test_cases["expected_na_count"] @@ -715,3 +715,246 @@ def test_that_create_missing_columns_returns_expected_output_with_multi_col_df() assert result.isna().sum().sum() == 11 assert_frame_equal(result, expected_output, check_exact=True) + + +def get_row_indices_for_invalid_column_values_test_cases(): + return [ + { + "name": "has_na_and_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": True, + "sep": None, + "expected_index": pd.Index([1]), + }, + { + "name": "has_na_but_not_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": False, + "sep": None, + "expected_index": pd.Index([1, 2, 3]), + }, + { + "name": "invalid_values_na_allowed", + "df": pd.DataFrame({"test_col": ["val1", "VAL1", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": True, + "sep": None, + "expected_index": pd.Index([0, 1]), + }, + { + "name": "invalid_values_na_not_allowed", + "df": pd.DataFrame({"test_col": ["val1", "VAL1", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "na_allowed": False, + "sep": None, + "expected_index": pd.Index([0, 1, 2, 3]), + }, + { + "name": "values_in_list", + "df": pd.DataFrame( + { + "test_col": [ + "Val1;Val2", + "Val1;Val2;Val3", + "Val1", + "Val1;", + "Val1;None", + ] + } + ), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "na_allowed": True, + "sep": ";", + "expected_index": pd.Index([1, 3, 4]), + }, + { + "name": "valid_data", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "na_allowed": False, + "sep": ";", + "expected_index": pd.Index([]), + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + get_row_indices_for_invalid_column_values_test_cases(), + ids=lambda x: x["name"], +) +def test_get_row_indices_for_invalid_column_values(test_cases): + df = test_cases["df"] + col = test_cases["col"] + possible_values = test_cases["possible_values"] + na_allowed = test_cases["na_allowed"] + sep = test_cases["sep"] + results = process_functions.get_row_indices_for_invalid_column_values( + df, col, possible_values, na_allowed, sep + ) + assert results.equals(test_cases["expected_index"]) + + +def get_message_for_invalid_column_value_test_cases(): + return [ + { + "name": "invalid_data", + "col": "test_col", + "filename": "test_filename", + "invalid_indices": pd.Index([1, 2, 3]), + "possible_values": ["Val1"], + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. " + "You have 3 row(s) in your file where test_col column contains invalid values. " + "The row(s) this occurs in are: [1, 2, 3]. Please correct.\n", + "expected_warning": "", + }, + { + "name": "valid_data", + "col": "test_col", + "filename": "test_filename", + "invalid_indices": pd.Index([]), + "possible_values": ["Val1", "Val2"], + "expected_error": "", + "expected_warning": "", + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + get_message_for_invalid_column_value_test_cases(), + ids=lambda x: x["name"], +) +def test_get_message_for_invalid_column_value(test_cases): + col = test_cases["col"] + filename = test_cases["filename"] + invalid_indices = test_cases["invalid_indices"] + possible_values = test_cases["possible_values"] + warning, error = process_functions.get_message_for_invalid_column_value( + col, filename, invalid_indices, possible_values + ) + assert warning == test_cases["expected_warning"] + assert error == test_cases["expected_error"] + + +def check_col_and_values_row_specific_test_cases(): + return [ + { + "name": "valid_data_with_value_list", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "", + "expected_warning": "", + }, + { + "name": "valid_data_with_individual_value_na_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "", + "expected_warning": "", + }, + { + "name": "missing_required_column", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col1", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "test_filename: Must have test_col1 column.\n", + "expected_warning": "", + }, + { + "name": "missing_optional_column", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col1", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": False, + "sep": ";", + "expected_error": "", + "expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n", + }, + { + "name": "invalid_data_with_value_list", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "Val1;Val2"]}), + "col": "test_col", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": ";", + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. " + "You have 2 row(s) in your file where test_col column contains invalid values. " + "The row(s) this occurs in are: [1, 2]. Please correct.\n", + "expected_warning": "", + }, + { + "name": "invalid_data_with_individual_value_na_not_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1", "Val2"], + "filename": "test_filename", + "na_allowed": False, + "required": True, + "sep": None, + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. " + "You have 3 row(s) in your file where test_col column contains invalid values. " + "The row(s) this occurs in are: [2, 3, 4]. Please correct.\n", + "expected_warning": "", + }, + { + "name": "invalid_data_with_individual_value_na_allowed", + "df": pd.DataFrame({"test_col": ["Val1", "Val2", "", float("nan"), None]}), + "col": "test_col", + "possible_values": ["Val1"], + "filename": "test_filename", + "na_allowed": True, + "required": True, + "sep": None, + "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. " + "You have 2 row(s) in your file where test_col column contains invalid values. " + "The row(s) this occurs in are: [1, 2]. Please correct.\n", + "expected_warning": "", + }, + ] + + +@pytest.mark.parametrize( + "test_cases", + check_col_and_values_row_specific_test_cases(), + ids=lambda x: x["name"], +) +def test_check_col_and_values_row_specific(test_cases): + df = test_cases["df"] + col = test_cases["col"] + possible_values = test_cases["possible_values"] + filename = test_cases["filename"] + na_allowed = test_cases["na_allowed"] + required = test_cases["required"] + sep = test_cases["sep"] + warning, error = process_functions.check_column_and_values_row_specific( + df, col, possible_values, filename, na_allowed, required, sep + ) + assert warning == test_cases["expected_warning"] + assert error == test_cases["expected_error"]