Merge pull request #516 from Sage-Bionetworks/GEN-636-allow-na-blank

[GEN-636] Allow NAs/blanks for unrequired columns in SV
Sage-Bionetworks · Apr 28, 2023 · 155fe8f · 155fe8f
2 parents f2f9ca3 + e8b3428
commit 155fe8f
Show file tree

Hide file tree

Showing 6 changed files with 155 additions and 11 deletions.
diff --git a/genie/transform.py b/genie/transform.py
@@ -1,6 +1,9 @@
 """This module contains all the transformation functions used throughout the GENIE
 package"""
 
+import pandas as pd
+from pandas.api.types import is_integer_dtype, is_float_dtype
+
 
 def _col_name_to_titlecase(string: str) -> str:
     """Convert strings to titlecase. Supports strings separated by _.
@@ -21,3 +24,20 @@ def _col_name_to_titlecase(string: str) -> str:
     for titlecase, abbrev in abbrev_map.items():
         converted_str = converted_str.replace(titlecase, abbrev)
     return converted_str
+
+
+def _convert_col_with_nas_to_str(df: pd.DataFrame, col: str) -> list:
+    """Stringify the non-NA values of a column, leaving NAs as they are.
+
+    Args:
+        df (pd.DataFrame): Dataframe
+        col (str): Column header of the column to convert
+
+    Returns:
+        list: One entry per row of ``df[col]``: ``str(value)`` for values
+            that are not NA, and the original NA object otherwise.
+    """
+    converted = []
+    for value in df[col]:
+        if pd.notna(value):
+            converted.append(str(value))
+        else:
+            # keep the NA marker itself so later NA checks still see it
+            converted.append(value)
+    return converted
+
+
+def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
+    """Convert a float column that contains NAs back into nullable integers.
+
+    pandas stores an integer column that has missing values as float, so
+    ``1`` becomes ``1.0``. When ``df[col]`` is a float column with at least
+    one NA, its values are cast to the nullable ``Int64`` dtype so the
+    integers are restored and the NAs are kept.
+
+    Args:
+        df (pd.DataFrame): Dataframe
+        col (str): Column header of the column to convert
+
+    Returns:
+        list: The values cast to nullable integers when the column is a
+            float column with NAs and the cast succeeds; otherwise the
+            values of the column unchanged.
+    """
+    if is_float_dtype(df[col]) and df[col].isnull().values.any():
+        try:
+            return df[col].astype(pd.Int64Dtype()).tolist()
+        except (TypeError, ValueError):
+            # pandas refuses the cast when a value is not a whole number
+            # (e.g. 10.5). Return the values untouched instead of raising,
+            # so the caller can still inspect and report on them.
+            return df[col].tolist()
+    return df[col].tolist()
diff --git a/genie/validate.py b/genie/validate.py
@@ -5,7 +5,14 @@
 import synapseclient
 from synapseclient.core.exceptions import SynapseHTTPError
 
-from genie import config, example_filetype_format, extract, load, process_functions
+from genie import (
+    config,
+    example_filetype_format,
+    extract,
+    load,
+    process_functions,
+    transform,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -82,7 +89,6 @@ def validate_single_file(self, **kwargs):
             message: errors and warnings
             valid: Boolean value of validation status
         """
-
         if self.file_type not in self._format_registry:
             valid_result_cls = example_filetype_format.ValidationResults(
                 errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
@@ -151,14 +157,20 @@ def _check_center_input(center, center_list):
 
 
 def _validate_chromosome(
-    df: pd.DataFrame, col: str, fileformat: str, allow_chr: bool = True
+    df: pd.DataFrame,
+    col: str,
+    fileformat: str,
+    allow_chr: bool = True,
+    allow_na: bool = False,
 ) -> tuple:
     """Validate chromosome values
 
     Args:
         df (pd.DataFrame): Dataframe
         col (str): Column header for column containing chromosome values
         fileformat (str): GENIE supported file format
+        allow_chr (bool): whether the chr prefix is allowed in the values
+        allow_na (bool): whether NA/blanks are allowed in the values
 
     Returns:
         tuple: errors and warnings
@@ -177,10 +189,16 @@ def _validate_chromosome(
         #     str(chrom).replace("chr", "") in accepted_chromosomes
         #     for chrom in df[col]
         # ]
-        correct_chromosomes = df[col].astype(str).str.replace("chr", "")
-        df[col] = correct_chromosomes
+        # preserve NAs
+        df[col] = transform._convert_float_col_with_nas_to_int(df, col)
+        df[col] = transform._convert_col_with_nas_to_str(df, col)
+        df[col] = [val.replace("chr", "") if pd.notna(val) else val for val in df[col]]
         warning, error = process_functions.check_col_and_values(
-            df=df, col=col, possible_values=ACCEPTED_CHROMOSOMES, filename=fileformat
+            df=df,
+            col=col,
+            possible_values=ACCEPTED_CHROMOSOMES,
+            filename=fileformat,
+            na_allowed=allow_na,
         )
         errors += error
         warnings += warning

diff --git a/genie_registry/structural_variant.py b/genie_registry/structural_variant.py
@@ -189,6 +189,7 @@ def _validate(self, sv_df):
             col="NCBI_BUILD",
             possible_values=["GRCh37", "GRCh38"],
             filename="Structural Variant",
+            na_allowed=True,
             required=False,
         )
         # total_warning.write(warn)
@@ -236,7 +237,6 @@ def _validate(self, sv_df):
         )
         # total_warning.write(warn)
         total_error.write(error)
-
         # check for chromosome columns and don't allow 'chr' for now
         # since in the database there’s nothing with CHR
         chrom_cols = ["SITE1_CHROMOSOME", "SITE2_CHROMOSOME"]
@@ -246,6 +246,7 @@ def _validate(self, sv_df):
                 col=chrom_col,
                 fileformat="Structural Variant",
                 allow_chr=False,
+                allow_na=True,
             )
             total_error.write(error)
 

diff --git a/tests/test_sv.py b/tests/test_sv.py
@@ -119,13 +119,13 @@ def test_validation_no_errors(self):
                 "SITE1_ENTREZ_GENE_ID": [1, 2, 2],
                 "SITE2_ENTREZ_GENE_ID": [1, 3, 3],
                 "SITE1_REGION_NUMBER": [1, 2, 2],
-                "NCBI_BUILD": ["GRCh38", "GRCh37", "GRCh37"],
+                "NCBI_BUILD": ["GRCh38", float("nan"), "GRCh37"],
                 "BREAKPOINT_TYPE": ["PRECISE", "IMPRECISE", "IMPRECISE"],
                 "CONNECTION_TYPE": ["3to5", "5to5", "5to5"],
                 "DNA_SUPPORT": ["Yes", "No", "Unknown"],
                 "RNA_Support": ["Yes", "No", "Unknown"],
-                "SITE1_CHROMOSOME": [1, 22, 22],
-                "SITE2_CHROMOSOME": ["X", "2", "2"],
+                "SITE1_CHROMOSOME": [1, 22, float("nan")],
+                "SITE2_CHROMOSOME": ["X", "2", float("nan")],
                 "SITE1_REGION": ["IGR", "Upstream", "5_Prime_UTR Intron"],
                 "SITE2_REGION": ["3-UTR", "3_Prime_UTR Intron", "Exon"],
             }

diff --git a/tests/test_transform.py b/tests/test_transform.py
@@ -0,0 +1,75 @@
+"""Test genie.transform module"""
+from unittest.mock import patch
+
+import pandas as pd
+import pytest
+
+from genie import transform
+
+
+class TestConvertCols:
+    """Tests for the column conversion helpers in genie.transform"""
+    @pytest.mark.parametrize(
+        "test_input, expected",
+        [
+            (pd.DataFrame({"some_col": [10.0, float("nan")]}), ["10.0", float("nan")]),
+            # [1, None] is expected to come back as "1.0", i.e. the int is
+            # already a float by the time it is converted to str
+            (pd.DataFrame({"some_col": [1, None]}), ["1.0", None]),
+            (
+                pd.DataFrame({"some_col": ["Val1", float("nan")]}),
+                ["Val1", float("nan")],
+            ),
+        ],
+        ids=["float_w_na", "int_w_na", "string_w_na"],
+    )
+    def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
+        self, test_input, expected
+    ):
+        """Non-NA value is converted to str and the NA value stays NA"""
+        result = transform._convert_col_with_nas_to_str(test_input, "some_col")
+        assert result[0] == expected[0]
+        # only NA-ness of the second value is checked, not expected[1] itself
+        assert pd.isna(result[1])
+
+    @pytest.mark.parametrize(
+        "test_input, expected",
+        [
+            (pd.DataFrame({"some_col": [10.0, 11.2]}), ["10.0", "11.2"]),
+            (
+                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
+                ["Val1", "Val2"],
+            ),
+        ],
+        ids=["float_no_na", "string_no_na"],
+    )
+    def test_that__convert_col_with_nas_to_str_returns_correct_vals_with_no_na_data(
+        self, test_input, expected
+    ):
+        """Columns without NAs are converted to a list of str values"""
+        result = transform._convert_col_with_nas_to_str(test_input, "some_col")
+        assert result == expected
+
+    def test_that__convert_float_col_with_nas_to_int(self):
+        """Float column with an NA: value compares equal to the int, NA is kept"""
+        test_input = pd.DataFrame({"some_col": [10.0, float("nan")]})
+        result = transform._convert_float_col_with_nas_to_int(test_input, "some_col")
+        assert result[0] == 10
+        assert pd.isna(result[1])
+
+    @pytest.mark.parametrize(
+        "test_input, expected",
+        [
+            (pd.DataFrame({"some_col": [10.0, 11.2]}), [10.0, 11.2]),
+            (
+                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
+                ["Val1", "Val2"],
+            ),
+            (pd.DataFrame({"some_col": [10, 11]}), [10, 11]),
+        ],
+        ids=["float_no_na", "string_no_na", "int_no_na"],
+    )
+    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_no_na_data(
+        self, test_input, expected
+    ):
+        """Columns without NAs are returned as a list with the same values"""
+        result = transform._convert_float_col_with_nas_to_int(test_input, "some_col")
+        assert result == expected
+
+    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_str_data(self):
+        """A string column with an NA is returned with its values unchanged"""
+        test_input = pd.DataFrame({"some_col": ["Val1", float("nan")]})
+        result = transform._convert_float_col_with_nas_to_int(test_input, "some_col")
+        assert result[0] == "Val1"
+        assert pd.isna(result[1])
diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -6,7 +6,7 @@
 import synapseclient
 from synapseclient.core.exceptions import SynapseHTTPError
 
-from genie import example_filetype_format, extract, load, validate
+from genie import example_filetype_format, extract, load, validate, process_functions
 
 CENTER = "SAGE"
 CNA_ENT = synapseclient.File(
@@ -380,6 +380,36 @@ def test_invalid_nochr__validate_chromosome():
     assert warnings == "", "Warnings should be empty"
 
 
+@pytest.mark.parametrize(
+    "test_na_allowed,expected_val",
+    [(True, True), (False, False)],
+    ids=[
+        "allow_na_is_true",
+        "allow_na_is_false",
+    ],
+)
+def test_that__validate_chromosome_calls_check_col_and_values_with_correct_na_allowed_val(
+    test_na_allowed, expected_val
+):
+    """Check that _validate_chromosome passes its allow_na argument on to
+    process_functions.check_col_and_values as na_allowed"""
+    input_df = pd.DataFrame({"SITE1_CHROMOSOME": [2, 3, 4]})
+    # check_col_and_values is mocked to return an empty pair of strings,
+    # so only the arguments it is called with are under test
+    with patch.object(
+        process_functions, "check_col_and_values", return_value=("", "")
+    ) as check_col_and_values_mock:
+        validate._validate_chromosome(
+            df=input_df,
+            col="SITE1_CHROMOSOME",
+            fileformat="Structural Variant",
+            allow_na=test_na_allowed,
+        )
+        check_col_and_values_mock.assert_called_once_with(
+            df=input_df,
+            col="SITE1_CHROMOSOME",
+            possible_values=validate.ACCEPTED_CHROMOSOMES,
+            filename="Structural Variant",
+            na_allowed=expected_val,
+        )
+
+
 ONCOTREE_ENT = "syn222"