From 9776d1269dc2eb71092ed00fb859e0512dfb9678 Mon Sep 17 00:00:00 2001
From: rxu17 <26471741+rxu17@users.noreply.github.com>
Date: Thu, 2 Nov 2023 17:52:27 -0700
Subject: [PATCH] add code for allele validation - initial

---
 genie/validate.py      | 59 +++++++++++++++++++++++++++
 genie_registry/maf.py  | 15 +++++++
 genie_registry/vcf.py  | 16 ++++++++
 tests/test_maf.py      |  8 ++++
 tests/test_validate.py | 91 ++++++++++++++++++++++++++++++++++++++++++
 tests/test_vcf.py      | 20 +++++-----
 6 files changed, 200 insertions(+), 9 deletions(-)

diff --git a/genie/validate.py b/genie/validate.py
index 10d8f586..d8bcd9c7 100644
--- a/genie/validate.py
+++ b/genie/validate.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+import re
 import logging
 from typing import Dict, List, Optional
 
@@ -415,3 +416,61 @@ def standardize_string_for_validation(
         return standardized_str
     else:
         return input_string
+
+
+def get_invalid_allele_rows(
+    input_data: pd.DataFrame,
+    input_col: str,
+    allowed_alleles: list,
+    ignore_case: bool = False,
+) -> pd.Index:
+    """
+    Find invalid indices in a DataFrame column based on allowed allele values.
+
+    Args:
+        input_data (pd.DataFrame): The DataFrame to search.
+        input_col (str): The name of the column to check.
+        allowed_alleles (list): The allowed allele characters (matched literally).
+        ignore_case (bool, optional): whether to perform case-insensitive matching
+
+    Returns:
+        pd.Index: A pandas index object indicating the row indices that
+        don't match the allowed alleles
+    """
+    search_str = rf"^[{''.join(map(re.escape, allowed_alleles))}]+$"
+    if ignore_case:
+        flags = re.IGNORECASE
+    else:
+        flags = 0  # no flags
+    # NAs should not be considered as a match
+    matching_indices = input_data[input_col].str.match(
+        search_str, flags=flags, na=False
+    )
+    invalid_indices = input_data[~matching_indices].index
+    return invalid_indices
+
+
+def get_allele_validation_message(
+    invalid_indices: pd.Index, invalid_col: str, allowed_alleles: list, fileformat: str
+) -> tuple:
+    """Creates the error/warning message for the check for invalid alleles
+
+    Args:
+        invalid_indices (pd.Index): the row indices that
+        have invalid alleles
+        invalid_col (str): The column with the invalid values
+        allowed_alleles (list): The list of allowed allele values.
+        fileformat (str): Name of the fileformat
+
+    Returns:
+        tuple: The errors and warnings from the allele validation
+        Defaults to blank strings
+    """
+    errors = ""
+    warnings = ""
+    if len(invalid_indices) > 0:
+        errors = (
+            f"{fileformat}: Your {invalid_col} column has invalid allele values. "
+            f"These are the accepted allele values: {allowed_alleles}.\n"
+        )
+    return errors, warnings
diff --git a/genie_registry/maf.py b/genie_registry/maf.py
index ab8f9193..4575d189 100644
--- a/genie_registry/maf.py
+++ b/genie_registry/maf.py
@@ -294,6 +294,21 @@ def _validate(self, mutationDF):
             )
             total_error.write(errors)
             warning.write(warnings)
+
+        # TODO: add these lists as class attribute or global
+        allele_cols = ["REFERENCE_ALLELE", "TUMOR_SEQ_ALLELE1", "TUMOR_SEQ_ALLELE2"]
+        allowed_alleles = ['A','T','C','G','N', ' ', '-']
+        for allele_col in allele_cols:
+            if process_functions.checkColExist(mutationDF, allele_col):
+                invalid_indices = validate.get_invalid_allele_rows(
+                    mutationDF, allele_col, allowed_alleles = allowed_alleles, ignore_case = True
+                )
+                errors, warnings = validate.get_allele_validation_message(
+                    invalid_indices, invalid_col = allele_col, allowed_alleles = allowed_alleles, fileformat="maf"
+                )
+                total_error.write(errors)
+                warning.write(warnings)
+
         return total_error.getvalue(), warning.getvalue()
 
     def _cross_validate(self, mutationDF: pd.DataFrame) -> tuple:
diff --git a/genie_registry/vcf.py b/genie_registry/vcf.py
index 71ad86a4..75525bd2 100644
--- a/genie_registry/vcf.py
+++ b/genie_registry/vcf.py
@@ -137,6 +137,22 @@ def _validate(self, vcfdf):
             total_error += error
             warning += warn
 
+        # TODO: add this as class attribute or global
+        allele_col = "REF"
+        allowed_alleles = ["A", "T", "C", "G", "N"]
+        if process_functions.checkColExist(vcfdf, allele_col):
+            invalid_indices = validate.get_invalid_allele_rows(
+                vcfdf, allele_col, allowed_alleles=allowed_alleles, ignore_case=True
+            )
+            errors, warnings = validate.get_allele_validation_message(
+                invalid_indices,
+                invalid_col=allele_col,
+                allowed_alleles=allowed_alleles,
+                fileformat="vcf",
+            )
+            total_error += errors
+            warning += warnings
+
         # No white spaces
         white_space = vcfdf.apply(lambda x: contains_whitespace(x), axis=1)
         if sum(white_space) > 0:
diff --git a/tests/test_maf.py b/tests/test_maf.py
index ef07d54d..9d9db949 100644
--- a/tests/test_maf.py
+++ b/tests/test_maf.py
@@ -94,6 +94,8 @@ def test_firstcolumn_validation(maf_class):
         "maf: First column header must be "
         "one of these: CHROMOSOME, HUGO_SYMBOL, "
         "TUMOR_SAMPLE_BARCODE.\n"
+        "maf: Your REFERENCE_ALLELE column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     assert error == expectedErrors
     assert warning == ""
@@ -147,6 +149,10 @@ def test_errors_validation(maf_class):
         "This column must only be these values: 1, 2, 3, 4, 5, 6, 7, 8, 9, "
         "10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
         "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
+        "maf: Your REFERENCE_ALLELE column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
+        "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     expectedWarnings = (
         "maf: "
@@ -195,6 +201,8 @@ def test_invalid_validation(maf_class):
         "maf: "
         "TUMOR_SEQ_ALLELE2 can't have any blank or null values.\n"
         "maf: TUMOR_SAMPLE_BARCODE must start with GENIE-SAGE\n"
+        "maf: Your TUMOR_SEQ_ALLELE2 column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N', ' ', '-'].\n"
     )
     expectedWarnings = (
         "maf: TUMOR_SEQ_ALLELE2 column contains 'NA' values, "
diff --git a/tests/test_validate.py b/tests/test_validate.py
index 3589166b..01b87d7d 100644
--- a/tests/test_validate.py
+++ b/tests/test_validate.py
@@ -768,3 +768,94 @@ def test_that_standardize_string_for_validation_returns_expected(
         allow_underscore=allow_underscore,
     )
     assert test_str == expected
+
+
+@pytest.mark.parametrize(
+    "input,expected_index,allowed_alleles,ignore_case",
+    [
+        (
+            pd.DataFrame(
+                {"REFERENCE_ALLELE": ["ACGT-G", "A-CGT ", "A", "C", "T", "G", "-", " "]}
+            ),
+            pd.Index([]),
+            ["A", "T", "C", "G", " ", "-"],
+            True,
+        ),
+        (
+            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-g", "acgt", " "]}),
+            pd.Index([]),
+            ["A", "T", "C", "G", " ", "-"],
+            True,
+        ),
+        (
+            pd.DataFrame({"REFERENCE_ALLELE": ["@##", "ACGTX"]}),
+            pd.Index([0, 1]),
+            ["A", "T", "C", "G", " ", "-"],
+            True,
+        ),
+        (
+            pd.DataFrame({"REFERENCE_ALLELE": ["XXX", "ACGT"]}),
+            pd.Index([0]),
+            ["A", "T", "C", "G", " ", "-"],
+            True,
+        ),
+        (
+            pd.DataFrame({"REFERENCE_ALLELE": ["ACGT-G", pd.NA, None]}),
+            pd.Index([1, 2]),
+            ["A", "T", "C", "G", " ", "-"],
+            True,
+        ),
+        (
+            pd.DataFrame({"REFERENCE_ALLELE": ["acgt-G"]}),
+            pd.Index([0]),
+            ["A", "T", "C", "G", " ", "-"],
+            False,
+        ),
+    ],
+    ids=[
+        "correct_alleles",
+        "correct_alleles_case",
+        "invalid_special_chars",
+        "invalid_chars",
+        "missing_entries",
+        "case_not_ignored",
+    ],
+)
+def test_that_get_invalid_allele_rows_returns_expected(
+    input, expected_index, allowed_alleles, ignore_case
+):
+    invalid_rows = validate.get_invalid_allele_rows(
+        input,
+        input_col="REFERENCE_ALLELE",
+        allowed_alleles=allowed_alleles,
+        ignore_case=ignore_case,
+    )
+    assert invalid_rows.equals(expected_index)
+
+
+@pytest.mark.parametrize(
+    "input_invalid_rows,expected_error,expected_warning",
+    [
+        (
+            pd.Index([1, 2, 3]),
+            (
+                "maf: Your REFERENCE_ALLELE column has invalid allele values. "
+                "These are the accepted allele values: ['A', 'C', 'T', 'G', ' ', '-'].\n"
+            ),
+            "",
+        ),
+        ([], "", ""),
+    ],
+    ids=["has_invalid_alleles", "has_no_invalid_alleles"],
+)
+def test_that_get_allele_validation_message_returns_expected(
+    input_invalid_rows, expected_error, expected_warning
+):
+    error, warning = validate.get_allele_validation_message(
+        input_invalid_rows,
+        invalid_col="REFERENCE_ALLELE",
+        allowed_alleles=["A", "C", "T", "G", " ", "-"],
+        fileformat="maf",
+    )
+    assert error == expected_error
+    assert warning == expected_warning
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
index 1c78d87f..6cff29d0 100644
--- a/tests/test_vcf.py
+++ b/tests/test_vcf.py
@@ -29,7 +29,7 @@ def test_validation_valid_no_samples(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -47,7 +47,7 @@ def test_validation_valid_one_sample_tumor(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -67,7 +67,7 @@ def test_validation_valid_one_sample(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -88,7 +88,7 @@ def test_validation_missing_format_col(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -107,7 +107,7 @@ def test_validation_invalid_one_sample(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -130,7 +130,7 @@ def test_validation_valid_two_samples(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -151,7 +151,7 @@ def test_validation_invalid_two_samples_tumor(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -172,7 +172,7 @@ def test_validation_invalid_two_samples_normal(vcf_class):
             "#CHROM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AAED1", "AAAS"],
@@ -193,7 +193,7 @@ def test_validation_invalid_white_space(vcf_class):
             "#CHROMM": ["2", "9", "12"],
             "POS": [69688533, 99401860, 53701241],
             "ID": ["AAK1", "AAED1", "AAAS"],
-            "REF": ["AAK1", "AAED1", "AAAS"],
+            "REF": ["AANT", "AACG", "AAAN"],
             "ALT": ["AAK1", "AAED1", "AAAS"],
             "QUAL": ["AAK1", "AAED1", "AAAS"],
             "FILTER": ["AAK1", "AA ED1", "AAAS"],
@@ -231,6 +231,8 @@ def test_validation_invalid_content(vcf_class):
         "space delimited instead of tab delimited.\n"
         "vcf: Please double check your #CHROM column. This column must only be these values: "
         "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, X, Y, MT\n"
+        "vcf: Your REF column has invalid allele values. "
+        "These are the accepted allele values: ['A', 'T', 'C', 'G', 'N'].\n"
     )
     expectedWarning = "vcf: Should not have the chr prefix in front of chromosomes.\n"
     assert error == expectedError