def _convert_col_with_nas_to_str(df: pd.DataFrame, col: str) -> list:
    """Convert the values of a column to str while preserving NAs.

    Args:
        df (pd.DataFrame): Dataframe containing the column
        col (str): Column header of the column to convert

    Returns:
        list: The column values in their original order, where every
            non-NA value has been passed through ``str`` and every NA
            value is returned untouched
    """
    return [str(val) if pd.notna(val) else val for val in df[col]]


def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
    """Convert a float column containing NAs back into integer values.

    pandas turns an integer column into a float column as soon as it
    contains NAs (1 becomes 1.0). This restores the integer values while
    keeping the NAs intact.

    The column is returned unchanged when it is not a float column, when
    it has no NAs, or when at least one non-NA value is not a whole number
    (e.g. 1.5), because such a column cannot be cast to integers safely.

    Args:
        df (pd.DataFrame): Dataframe containing the column
        col (str): Column header of the column to convert

    Returns:
        list: The column values in their original order, converted to
            integers (with NAs preserved) when the conversion applies
    """
    values = df[col]
    if not is_float_dtype(values):
        return values.tolist()

    has_nas = values.isnull().values.any()
    non_null = values.dropna()
    # Casting to the nullable integer dtype raises TypeError on values
    # such as 1.5, so only cast when every non-NA value is a whole number.
    all_integral = bool((non_null % 1 == 0).all())
    if has_nas and all_integral:
        return values.astype(pd.Int64Dtype()).tolist()
    return values.tolist()
filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally", @@ -151,7 +157,11 @@ def _check_center_input(center, center_list): def _validate_chromosome( - df: pd.DataFrame, col: str, fileformat: str, allow_chr: bool = True + df: pd.DataFrame, + col: str, + fileformat: str, + allow_chr: bool = True, + allow_na: bool = False, ) -> tuple: """Validate chromosome values @@ -159,6 +169,8 @@ def _validate_chromosome( df (pd.DataFrame): Dataframe col (str): Column header for column containing chromosome values fileformat (str): GENIE supported file format + allow_chr (bool): whether the chr prefix is allowed in the values + allow_na (bool): whether NA/blanks are allowed in the values Returns: tuple: errors and warnings @@ -177,10 +189,16 @@ def _validate_chromosome( # str(chrom).replace("chr", "") in accepted_chromosomes # for chrom in df[col] # ] - correct_chromosomes = df[col].astype(str).str.replace("chr", "") - df[col] = correct_chromosomes + # preserve NAs + df[col] = transform._convert_float_col_with_nas_to_int(df, col) + df[col] = transform._convert_col_with_nas_to_str(df, col) + df[col] = [val.replace("chr", "") if pd.notna(val) else val for val in df[col]] warning, error = process_functions.check_col_and_values( - df=df, col=col, possible_values=ACCEPTED_CHROMOSOMES, filename=fileformat + df=df, + col=col, + possible_values=ACCEPTED_CHROMOSOMES, + filename=fileformat, + na_allowed=allow_na, ) errors += error warnings += warning diff --git a/genie_registry/structural_variant.py b/genie_registry/structural_variant.py index 4ef32cdf..cb124874 100644 --- a/genie_registry/structural_variant.py +++ b/genie_registry/structural_variant.py @@ -189,6 +189,7 @@ def _validate(self, sv_df): col="NCBI_BUILD", possible_values=["GRCh37", "GRCh38"], filename="Structural Variant", + na_allowed=True, required=False, ) # total_warning.write(warn) @@ -236,7 +237,6 @@ def _validate(self, sv_df): ) # 
class TestConvertCols:
    """Tests for the NA-preserving column converters in genie.transform"""

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            pytest.param(
                pd.DataFrame({"some_col": [10.0, float("nan")]}),
                ["10.0", float("nan")],
                id="float_w_na",
            ),
            pytest.param(
                pd.DataFrame({"some_col": [1, None]}),
                ["1.0", None],
                id="int_w_na",
            ),
            pytest.param(
                pd.DataFrame({"some_col": ["Val1", float("nan")]}),
                ["Val1", float("nan")],
                id="string_w_na",
            ),
        ],
    )
    def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
        self, test_input, expected
    ):
        """The non-NA value is stringified and the NA value stays NA"""
        converted = transform._convert_col_with_nas_to_str(test_input, "some_col")
        first_val, second_val = converted[0], converted[1]
        assert first_val == expected[0]
        # NaN never compares equal to itself, so check for NA explicitly
        assert pd.isna(second_val)

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            pytest.param(
                pd.DataFrame({"some_col": [10.0, 11.2]}),
                ["10.0", "11.2"],
                id="float_no_na",
            ),
            pytest.param(
                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
                ["Val1", "Val2"],
                id="string_no_na",
            ),
        ],
    )
    def test_that__convert_col_with_nas_to_str_returns_correct_vals_with_no_na_data(
        self, test_input, expected
    ):
        """Without NAs every value is simply converted to str"""
        converted = transform._convert_col_with_nas_to_str(test_input, "some_col")
        assert converted == expected

    def test_that__convert_float_col_with_nas_to_int(self):
        """A float column with NAs yields the integer value and keeps the NA"""
        float_df = pd.DataFrame({"some_col": [10.0, float("nan")]})
        converted = transform._convert_float_col_with_nas_to_int(
            float_df, "some_col"
        )
        assert converted[0] == 10
        assert pd.isna(converted[1])

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            pytest.param(
                pd.DataFrame({"some_col": [10.0, 11.2]}),
                [10.0, 11.2],
                id="float_no_na",
            ),
            pytest.param(
                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
                ["Val1", "Val2"],
                id="string_no_na",
            ),
            pytest.param(
                pd.DataFrame({"some_col": [10, 11]}),
                [10, 11],
                id="int_no_na",
            ),
        ],
    )
    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_no_na_data(
        self, test_input, expected
    ):
        """Columns without NAs come back with their values unchanged"""
        converted = transform._convert_float_col_with_nas_to_int(
            test_input, "some_col"
        )
        assert converted == expected

    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_str_data(self):
        """A string column with NAs is returned as is"""
        str_df = pd.DataFrame({"some_col": ["Val1", float("nan")]})
        converted = transform._convert_float_col_with_nas_to_int(str_df, "some_col")
        assert converted[0] == "Val1"
        assert pd.isna(converted[1])
@pytest.mark.parametrize(
    "test_na_allowed,expected_val",
    [
        pytest.param(True, True, id="allow_na_is_true"),
        pytest.param(False, False, id="allow_na_is_false"),
    ],
)
def test_that__validate_chromosome_calls_check_col_and_values_with_correct_na_allowed_val(
    test_na_allowed, expected_val
):
    """The allow_na argument is forwarded to check_col_and_values as na_allowed"""
    input_df = pd.DataFrame({"SITE1_CHROMOSOME": [2, 3, 4]})
    chrom_col = "SITE1_CHROMOSOME"
    file_format = "Structural Variant"
    # Replace the real value check so only the forwarded arguments are tested
    mocked_check = patch.object(
        process_functions, "check_col_and_values", return_value=("", "")
    )
    with mocked_check as check_col_and_values_mock:
        validate._validate_chromosome(
            df=input_df,
            col=chrom_col,
            fileformat=file_format,
            allow_na=test_na_allowed,
        )
        check_col_and_values_mock.assert_called_once_with(
            df=input_df,
            col=chrom_col,
            possible_values=validate.ACCEPTED_CHROMOSOMES,
            filename=file_format,
            na_allowed=expected_val,
        )