Skip to content

Commit

Permalink
add code for allele validation - initial
Browse files Browse the repository at this point in the history
  • Loading branch information
rxu17 committed Nov 3, 2023
1 parent c2ba7db commit 9776d12
Show file tree
Hide file tree
Showing 6 changed files with 200 additions and 9 deletions.
59 changes: 59 additions & 0 deletions genie/validate.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
import re
import logging
from typing import Dict, List, Optional

Expand Down Expand Up @@ -415,3 +416,61 @@ def standardize_string_for_validation(
return standardized_str
else:
return input_string


def get_invalid_allele_rows(
    input_data: pd.DataFrame,
    input_col: str,
    allowed_alleles: list,
    ignore_case: bool = False,
) -> pd.Index:
    """
    Find invalid indices in a DataFrame column based on allowed allele values.

    A value is valid only when it is a non-empty string made up entirely of
    characters taken from ``allowed_alleles``. Missing values (None, NaN,
    pd.NA) and any other non-string values are always reported as invalid.

    Args:
        input_data (pd.DataFrame): The DataFrame to search.
        input_col (str): The name of the column to check.
        allowed_alleles (list): The list of allowed allele values. Every
            character is matched literally, so entries such as "-" or "]"
            are safe in any position of the list.
        ignore_case (bool, optional): whether to perform case-insensitive
            matching. Defaults to False.

    Returns:
        pd.Index: A pandas index object indicating the row indices that
            don't match the allowed alleles
    """
    column = input_data[input_col]

    # Escape each allele so regex metacharacters cannot change the character
    # class (e.g. ["A", "-", "T"] must not turn into the range "A-T").
    allowed_chars = "".join(re.escape(allele) for allele in allowed_alleles)

    if allowed_chars:
        allele_regex = re.compile(
            rf"[{allowed_chars}]+", flags=re.IGNORECASE if ignore_case else 0
        )
        # fullmatch anchors both ends strictly, so a trailing newline is
        # rejected. The isinstance guard keeps NAs and numeric values from
        # matching and avoids the .str accessor, which raises on columns
        # that hold no strings at all (e.g. an all-NaN float column).
        validity = [
            isinstance(value, str) and allele_regex.fullmatch(value) is not None
            for value in column
        ]
    else:
        # Nothing is allowed, so no value can be valid.
        validity = [False] * len(column)

    is_valid = pd.Series(validity, index=column.index, dtype=bool)
    invalid_indices = input_data[~is_valid].index
    return invalid_indices


def get_allele_validation_message(
    invalid_indices: pd.Series, invalid_col: str, allowed_alleles: list, fileformat: str
) -> tuple:
    """Build the error/warning text for the invalid allele check

    Args:
        invalid_indices (pd.Series): the row indices that have invalid
            alleles; only its length is inspected
        invalid_col (str): The column with the invalid values
        allowed_alleles (list): The list of allowed allele values, shown
            in the message as-is
        fileformat (str): Name of the fileformat, used as the message prefix

    Returns:
        tuple: (errors, warnings). Both are blank strings when there are no
            invalid rows; this check never produces a warning.
    """
    blank = ""

    # Nothing invalid: report neither an error nor a warning.
    if len(invalid_indices) == 0:
        return blank, blank

    error_message = (
        f"{fileformat}: Your {invalid_col} column has invalid allele values. "
        f"These are the accepted allele values: {allowed_alleles}.\n"
    )
    return error_message, blank
15 changes: 15 additions & 0 deletions genie_registry/maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,21 @@ def _validate(self, mutationDF):
)
total_error.write(errors)
warning.write(warnings)

# TODO: add these lists as class attribute or global
allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
allowed_alleles = ['A','T','C','G','N', ' ', '-']
for allele_col in allele_cols:
if process_functions.checkColExist(mutationDF, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
mutationDF, allele_col, allowed_alleles = allowed_alleles, ignore_case = True
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices, invalid_col = allele_col, allowed_alleles = allowed_alleles, fileformat="maf"
)
total_error.write(errors)
warning.write(warnings)

return total_error.getvalue(), warning.getvalue()

def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
Expand Down
16 changes: 16 additions & 0 deletions genie_registry/vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,22 @@ def _validate(self, vcfdf):
total_error += error
warning += warn

# TODO: add this as class attribute or global
allele_col = "REF"
allowed_alleles = ["A", "T", "C", "G", "N"]
if process_functions.checkColExist(vcfdf, allele_col):
invalid_indices = validate.get_invalid_allele_rows(
vcfdf, allele_col, allowed_alleles=allowed_alleles, ignore_case=True
)
errors, warnings = validate.get_allele_validation_message(
invalid_indices,
invalid_col=allele_col,
allowed_alleles=allowed_alleles,
fileformat="vcf",
)
total_error += errors
warning += warnings

# No white spaces
white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
if sum(white_space) > 0:
Expand Down
8 changes: 8 additions & 0 deletions tests/test_maf.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,8 @@ def test_firstcolumn_validation(maf_class):
"maf: First column header must be "
"one of these: CHROMOSOME, HUGO_SYMBOL, "
"TUMOR_SAMPLE_BARCODE.\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
assert error == expectedErrors
assert warning == ""
Expand Down Expand Up @@ -147,6 +149,10 @@ def test_errors_validation(maf_class):
"This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
"10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your REFERENCE_ALLELE column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
expectedWarnings = (
"maf: "
Expand Down Expand Up @@ -195,6 +201,8 @@ def test_invalid_validation(maf_class):
"maf: "
"TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
"maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
"maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
)
expectedWarnings = (
"maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "
Expand Down
91 changes: 91 additions & 0 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected(
allow_underscore=allow_underscore,
)
assert test_str == expected


@pytest.mark.parametrize(
    "input,expected_index,allowed_alleles,ignore_case",
    [
        # Every value is built only from allowed characters.
        pytest.param(
            pd.DataFrame(
                {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]}
            ),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="correct_alleles",
        ),
        # Lower-case values pass when case is ignored.
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}),
            pd.Index([]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="correct_alleles_case",
        ),
        # Special characters, or one stray letter, invalidate the row.
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}),
            pd.Index([0, 1]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="invalid_special_chars",
        ),
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="invalid_chars",
        ),
        # Missing entries are reported as invalid.
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}),
            pd.Index([1, 2]),
            ["A", "T", "C", "G", " ", "-"],
            True,
            id="missing_entries",
        ),
        # Lower-case values fail when matching is case-sensitive.
        pytest.param(
            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}),
            pd.Index([0]),
            ["A", "T", "C", "G", " ", "-"],
            False,
            id="case_not_ignored",
        ),
    ],
)
def test_that_get_invalid_allele_rows_returns_expected(
    input, expected_index, allowed_alleles, ignore_case
):
    """Check the invalid row index returned for each allele scenario."""
    actual_index = validate.get_invalid_allele_rows(
        input,
        input_col="REFERENCE_ALLELE",
        allowed_alleles=allowed_alleles,
        ignore_case=ignore_case,
    )
    assert actual_index.equals(expected_index)


@pytest.mark.parametrize(
    "input_invalid_rows,expected_error,expected_warning",
    [
        # Any invalid row yields the error text and no warning.
        pytest.param(
            pd.Index([1, 2, 3]),
            (
                "maf: Your REFERENCE_ALLELE column has invalid allele values. "
                "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n"
            ),
            "",
            id="has_invalid_alleles",
        ),
        # No invalid rows yields two blank strings.
        pytest.param([], "", "", id="has_no_invalid_alleles"),
    ],
)
def test_that_get_allele_validation_message_returns_expected(
    input_invalid_rows, expected_error, expected_warning
):
    """Check the (error, warning) pair built from the invalid row indices."""
    messages = validate.get_allele_validation_message(
        input_invalid_rows,
        invalid_col="REFERENCE_ALLELE",
        allowed_alleles=["A", "C", "T", "G", " ", "-"],
        fileformat="maf",
    )
    actual_error, actual_warning = messages
    assert actual_error == expected_error
    assert actual_warning == expected_warning
20 changes: 11 additions & 9 deletions tests/test_vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class):
"#CHROM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AAED1", "AAAS"],
Expand All @@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class):
"#CHROMM": ["2", "9", "12"],
"POS": [69688533, 99401860, 53701241],
"ID": ["AAK1", "AAED1", "AAAS"],
"REF": ["AAK1", "AAED1", "AAAS"],
"REF": ["AANT", "AACG", "AAAN"],
"ALT": ["AAK1", "AAED1", "AAAS"],
"QUAL": ["AAK1", "AAED1", "AAAS"],
"FILTER": ["AAK1", "AA ED1", "AAAS"],
Expand Down Expand Up @@ -231,6 +231,8 @@ def test_validation_invalid_content(vcf_class):
"space delimited instead of tab delimited.\n"
"vcf: Please double check your #CHROM column. This column must only be these values: "
"1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
"vcf: Your REF column has invalid allele values. "
"These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n"
)
expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n"
assert error == expectedError
Expand Down

0 comments on commit 9776d12

Please sign in to comment.