Skip to content

Commit

Permalink
Merge pull request #516 from Sage-Bionetworks/GEN-636-allow-na-blank
Browse files Browse the repository at this point in the history
[GEN-636] Allow NAs/blanks for unrequired columns in SV
  • Loading branch information
rxu17 authored Apr 28, 2023
2 parents f2f9ca3 + e8b3428 commit 155fe8f
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 11 deletions.
20 changes: 20 additions & 0 deletions genie/transform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
"""This module contains all the transformation functions used throughout the GENIE
package"""

import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype


def _col_name_to_titlecase(string: str) -> str:
"""Convert strings to titlecase. Supports strings separated by _.
Expand All @@ -21,3 +24,20 @@ def _col_name_to_titlecase(string: str) -> str:
for titlecase, abbrev in abbrev_map.items():
converted_str = converted_str.replace(titlecase, abbrev)
return converted_str


def _convert_col_with_nas_to_str(df: pd.DataFrame, col: str) -> list:
    """Stringify the values of one column while leaving missing entries alone.

    Args:
        df (pd.DataFrame): Dataframe holding the column
        col (str): Name of the column to convert

    Returns:
        list: One entry per row. Values for which ``pd.notna`` is true are
            passed through ``str``; every other value is returned unchanged.
    """
    converted = []
    for entry in df[col]:
        if pd.notna(entry):
            converted.append(str(entry))
        else:
            # Keep the missing marker itself so it never becomes "nan"/"None"
            converted.append(entry)
    return converted


def _convert_float_col_with_nas_to_int(df: pd.DataFrame, col: str) -> list:
    """Turn a float column that contains NAs back into integers with NAs intact.

    pandas stores an integer column that has missing values as floats. When
    the column is a float dtype and has at least one NA, the values are cast
    to the nullable ``Int64`` dtype so that e.g. ``10.0`` becomes ``10``.

    Args:
        df (pd.DataFrame): Dataframe holding the column
        col (str): Name of the column to convert

    Returns:
        list: The column values. Converted to nullable integers when the
            column is float typed, contains NAs and can be cast safely;
            otherwise the values are returned unchanged.
    """
    series = df[col]
    if not (is_float_dtype(series) and series.isnull().values.any()):
        return series.tolist()
    try:
        return series.astype(pd.Int64Dtype()).tolist()
    except (TypeError, ValueError):
        # The nullable-integer cast refuses values that are not exactly
        # representable as integers (e.g. 1.5). Hand the values back untouched
        # so downstream validation can report them instead of crashing here.
        return series.tolist()
30 changes: 24 additions & 6 deletions genie/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@
import synapseclient
from synapseclient.core.exceptions import SynapseHTTPError

from genie import config, example_filetype_format, extract, load, process_functions
from genie import (
config,
example_filetype_format,
extract,
load,
process_functions,
transform,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -82,7 +89,6 @@ def validate_single_file(self, **kwargs):
message: errors and warnings
valid: Boolean value of validation status
"""

if self.file_type not in self._format_registry:
valid_result_cls = example_filetype_format.ValidationResults(
errors="Your filename is incorrect! Please change your filename before you run the validator or specify --filetype if you are running the validator locally",
Expand Down Expand Up @@ -151,14 +157,20 @@ def _check_center_input(center, center_list):


def _validate_chromosome(
df: pd.DataFrame, col: str, fileformat: str, allow_chr: bool = True
df: pd.DataFrame,
col: str,
fileformat: str,
allow_chr: bool = True,
allow_na: bool = False,
) -> tuple:
"""Validate chromosome values
Args:
df (pd.DataFrame): Dataframe
col (str): Column header for column containing chromosome values
fileformat (str): GENIE supported file format
allow_chr (bool): whether the chr prefix is allowed in the values
allow_na (bool): whether NA/blanks are allowed in the values
Returns:
tuple: errors and warnings
Expand All @@ -177,10 +189,16 @@ def _validate_chromosome(
# str(chrom).replace("chr", "") in accepted_chromosomes
# for chrom in df[col]
# ]
correct_chromosomes = df[col].astype(str).str.replace("chr", "")
df[col] = correct_chromosomes
# preserve NAs
df[col] = transform._convert_float_col_with_nas_to_int(df, col)
df[col] = transform._convert_col_with_nas_to_str(df, col)
df[col] = [val.replace("chr", "") if pd.notna(val) else val for val in df[col]]
warning, error = process_functions.check_col_and_values(
df=df, col=col, possible_values=ACCEPTED_CHROMOSOMES, filename=fileformat
df=df,
col=col,
possible_values=ACCEPTED_CHROMOSOMES,
filename=fileformat,
na_allowed=allow_na,
)
errors += error
warnings += warning
Expand Down
3 changes: 2 additions & 1 deletion genie_registry/structural_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def _validate(self, sv_df):
col="NCBI_BUILD",
possible_values=["GRCh37", "GRCh38"],
filename="Structural Variant",
na_allowed=True,
required=False,
)
# total_warning.write(warn)
Expand Down Expand Up @@ -236,7 +237,6 @@ def _validate(self, sv_df):
)
# total_warning.write(warn)
total_error.write(error)

# check for chromosome columns and don't allow 'chr' for now
# since in the database there’s nothing with CHR
chrom_cols = ["SITE1_CHROMOSOME", "SITE2_CHROMOSOME"]
Expand All @@ -246,6 +246,7 @@ def _validate(self, sv_df):
col=chrom_col,
fileformat="Structural Variant",
allow_chr=False,
allow_na=True,
)
total_error.write(error)

Expand Down
6 changes: 3 additions & 3 deletions tests/test_sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,13 +119,13 @@ def test_validation_no_errors(self):
"SITE1_ENTREZ_GENE_ID": [1, 2, 2],
"SITE2_ENTREZ_GENE_ID": [1, 3, 3],
"SITE1_REGION_NUMBER": [1, 2, 2],
"NCBI_BUILD": ["GRCh38", "GRCh37", "GRCh37"],
"NCBI_BUILD": ["GRCh38", float("nan"), "GRCh37"],
"BREAKPOINT_TYPE": ["PRECISE", "IMPRECISE", "IMPRECISE"],
"CONNECTION_TYPE": ["3to5", "5to5", "5to5"],
"DNA_SUPPORT": ["Yes", "No", "Unknown"],
"RNA_Support": ["Yes", "No", "Unknown"],
"SITE1_CHROMOSOME": [1, 22, 22],
"SITE2_CHROMOSOME": ["X", "2", "2"],
"SITE1_CHROMOSOME": [1, 22, float("nan")],
"SITE2_CHROMOSOME": ["X", "2", float("nan")],
"SITE1_REGION": ["IGR", "Upstream", "5_Prime_UTR Intron"],
"SITE2_REGION": ["3-UTR", "3_Prime_UTR Intron", "Exon"],
}
Expand Down
75 changes: 75 additions & 0 deletions tests/test_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""Test genie.transform module"""
from unittest.mock import patch

import pandas as pd
import pytest

from genie import transform


class TestConvertCols:
    """Tests for the NA-preserving column converters in ``genie.transform``."""

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            (pd.DataFrame({"some_col": [10.0, float("nan")]}), ["10.0", float("nan")]),
            (pd.DataFrame({"some_col": [1, None]}), ["1.0", None]),
            (
                pd.DataFrame({"some_col": ["Val1", float("nan")]}),
                ["Val1", float("nan")],
            ),
        ],
        ids=["float_w_na", "int_w_na", "string_w_na"],
    )
    def test_that__convert_col_with_nas_to_str_keep_na_for_any_data_type(
        self, test_input, expected
    ):
        """The first value is stringified and the trailing NA stays an NA."""
        converted = transform._convert_col_with_nas_to_str(test_input, "some_col")
        first_val, second_val = converted[0], converted[1]
        assert first_val == expected[0]
        assert pd.isna(second_val)

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            (pd.DataFrame({"some_col": [10.0, 11.2]}), ["10.0", "11.2"]),
            (
                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
                ["Val1", "Val2"],
            ),
        ],
        ids=["float_no_na", "string_no_na"],
    )
    def test_that__convert_col_with_nas_to_str_returns_correct_vals_with_no_na_data(
        self, test_input, expected
    ):
        """Without NAs every value comes back as its string form."""
        converted = transform._convert_col_with_nas_to_str(test_input, "some_col")
        assert converted == expected

    def test_that__convert_float_col_with_nas_to_int(self):
        """A float column with an NA is converted to ints with the NA kept."""
        float_df = pd.DataFrame({"some_col": [10.0, float("nan")]})
        converted = transform._convert_float_col_with_nas_to_int(float_df, "some_col")
        assert converted[0] == 10
        assert pd.isna(converted[1])

    @pytest.mark.parametrize(
        "test_input, expected",
        [
            (pd.DataFrame({"some_col": [10.0, 11.2]}), [10.0, 11.2]),
            (
                pd.DataFrame({"some_col": ["Val1", "Val2"]}),
                ["Val1", "Val2"],
            ),
            (pd.DataFrame({"some_col": [10, 11]}), [10, 11]),
        ],
        ids=["float_no_na", "string_no_na", "int_no_na"],
    )
    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_no_na_data(
        self, test_input, expected
    ):
        """Columns without NAs are returned with their values unchanged."""
        converted = transform._convert_float_col_with_nas_to_int(
            test_input, "some_col"
        )
        assert converted == expected

    def test_that__convert_float_col_with_nas_to_int_does_nothing_with_str_data(self):
        """A string column with an NA is passed through untouched."""
        str_df = pd.DataFrame({"some_col": ["Val1", float("nan")]})
        converted = transform._convert_float_col_with_nas_to_int(str_df, "some_col")
        assert converted[0] == "Val1"
        assert pd.isna(converted[1])
32 changes: 31 additions & 1 deletion tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import synapseclient
from synapseclient.core.exceptions import SynapseHTTPError

from genie import example_filetype_format, extract, load, validate
from genie import example_filetype_format, extract, load, validate, process_functions

CENTER = "SAGE"
CNA_ENT = synapseclient.File(
Expand Down Expand Up @@ -380,6 +380,36 @@ def test_invalid_nochr__validate_chromosome():
assert warnings == "", "Warnings should be empty"


@pytest.mark.parametrize(
    "test_na_allowed,expected_val",
    [(True, True), (False, False)],
    ids=[
        "allow_na_is_true",
        "allow_na_is_false",
    ],
)
def test_that__validate_chromosome_calls_check_col_and_values_with_correct_na_allowed_val(
    test_na_allowed, expected_val
):
    """``allow_na`` must be forwarded to ``check_col_and_values`` as ``na_allowed``."""
    chrom_col = "SITE1_CHROMOSOME"
    file_format = "Structural Variant"
    input_df = pd.DataFrame({chrom_col: [2, 3, 4]})
    # Replace the value checker so only the forwarded arguments are inspected
    check_patch = patch.object(
        process_functions, "check_col_and_values", return_value=("", "")
    )
    with check_patch as check_col_and_values_mock:
        validate._validate_chromosome(
            df=input_df,
            col=chrom_col,
            fileformat=file_format,
            allow_na=test_na_allowed,
        )
        check_col_and_values_mock.assert_called_once_with(
            df=input_df,
            col=chrom_col,
            possible_values=validate.ACCEPTED_CHROMOSOMES,
            filename=file_format,
            na_allowed=expected_val,
        )


ONCOTREE_ENT = "syn222"


Expand Down

0 comments on commit 155fe8f

Please sign in to comment.