Skip to content

Commit

Permalink
[GEN-1021] update library strategy (#580)
Browse files Browse the repository at this point in the history
* update valid library_strategy values
* comment unwanted kwargs out
* introduce row specific validation
  • Loading branch information
danlu1 authored Nov 14, 2024
1 parent 42b5ff8 commit a757ad7
Show file tree
Hide file tree
Showing 4 changed files with 384 additions and 38 deletions.
115 changes: 115 additions & 0 deletions genie/process_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,3 +980,118 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
elif data_type == "boolean":
dataset[column] = dataset[column].astype(pd.BooleanDtype())
return dataset[list(schema.keys())]


def get_row_indices_for_invalid_column_values(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    na_allowed: bool = False,
    sep: Optional[str] = None,
) -> pd.Index:
    """Return the row indices whose value in ``col`` is not an allowed value.

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        na_allowed (bool, optional): If NA is allowed. When True, NA cells are
            dropped before checking and are never reported. Defaults to False.
        sep (Optional[str], optional): The string separator. When given, each
            string cell is split on it and every piece must be an allowed
            value. Defaults to None.

    Returns:
        pd.Index: The row indices of the rows with values that are not in
            possible_values.
    """
    # Dropping NAs only removes cells that are missing as a whole; it does not
    # remove missing entries inside a separated list of values.
    check_values = df[col].dropna() if na_allowed else df[col]

    def _is_valid(value) -> bool:
        # Only strings can be split. A non-string cell (for example NaN when
        # NAs are not allowed) is compared as a single value instead of
        # raising AttributeError on ``.split``.
        if sep and isinstance(value, str):
            return all(part in possible_values for part in value.split(sep))
        return value in possible_values

    # An explicit bool dtype keeps the mask well-typed even for an empty column.
    is_valid = pd.Series(
        [_is_valid(value) for value in check_values],
        index=check_values.index,
        dtype=bool,
    )
    return is_valid[~is_valid].index


def get_message_for_invalid_column_value(
    col: str, filename: str, invalid_indices: pd.Index, possible_values: list
) -> tuple:
    """Build the warning and error messages for rows with invalid values.

    Args:
        col (str): The column to be checked
        filename (str): The file name
        invalid_indices (pd.Index): The row indices of the rows with invalid values
        possible_values (list): The list of possible values

    Returns:
        tuple: warning, error. The warning is always an empty string; the
            error is an empty string when invalid_indices is empty.
    """

    def _display(value) -> str:
        # Because of pandas typing, an integer column with one NA/blank value
        # is cast to double, so 1 is shown as "1.0". Strip only a trailing
        # ".0" so that values such as 1.05 are not mangled into "15".
        text = str(value)
        return text[:-2] if text.endswith(".0") else text

    warning = ""
    error = ""
    if len(invalid_indices) > 0:
        valid_values = ", ".join(_display(value) for value in possible_values)
        error = (
            f"{filename}: Please double check your {col} column. Valid values are {valid_values}. "
            f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
            f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
        )
    return (warning, error)


def check_column_and_values_row_specific(
    df: pd.DataFrame,
    col: str,
    possible_values: list,
    filename: str,
    na_allowed: bool = False,
    required: bool = False,
    sep: Optional[str] = None,
) -> tuple:
    """Validate that a column exists and that each of its rows holds a valid value.

    A missing column produces an error when it is required and a warning
    otherwise. A present column is checked row by row against possible_values.

    Args:
        df (pd.DataFrame): Input dataframe
        col (str): The column to be checked
        possible_values (list): The list of possible values
        filename (str): The file name
        na_allowed (bool, optional): If NA is allowed. Defaults to False.
        required (bool, optional): If the column is required. Defaults to False.
        sep (Optional[str], optional): The string separator. Defaults to None.

    Returns:
        tuple: warning, error
    """
    if checkColExist(df, col):
        # Column is present: find the offending rows, then describe them.
        bad_rows = get_row_indices_for_invalid_column_values(
            df, col, possible_values, na_allowed, sep
        )
        row_warning, row_error = get_message_for_invalid_column_value(
            col, filename, bad_rows, possible_values
        )
        return (row_warning, row_error)

    # Column is absent: a required column is an error, an optional one a warning.
    if required:
        return ("", f"{filename}: Must have {col} column.\n")

    missing_warning = (
        f"{filename}: Doesn't have {col} column. " "This column will be added.\n"
    )
    return (missing_warning, "")
21 changes: 5 additions & 16 deletions genie_registry/assay.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
"""Assay information class"""

import os
import yaml

import pandas as pd

from genie.example_filetype_format import FileTypeFormat
import yaml
from genie import extract, load, process_functions
from genie.example_filetype_format import FileTypeFormat


class Assayinfo(FileTypeFormat):
Expand All @@ -16,7 +15,7 @@ class Assayinfo(FileTypeFormat):

_process_kwargs = ["newPath", "databaseSynId"]

_validation_kwargs = ["project_id"]
# _validation_kwargs = ["project_id"]

def _validateFilename(self, filepath_list):
"""Validate assay information filename"""
Expand Down Expand Up @@ -128,7 +127,7 @@ def _get_dataframe(self, filepath_list):
all_panel_info = pd.concat([all_panel_info, assay_finaldf])
return all_panel_info

def _validate(self, assay_info_df, project_id):
def _validate(self, assay_info_df):
"""
Validates the values of assay information file
Expand Down Expand Up @@ -202,7 +201,7 @@ def _validate(self, assay_info_df, project_id):
warn, error = process_functions.check_col_and_values(
assay_info_df,
"library_strategy",
read_group_headers["library_strategy"]["enum"],
["Targeted Sequencing", "WXS"],
filename="Assay_information.yaml",
required=True,
)
Expand Down Expand Up @@ -231,16 +230,6 @@ def _validate(self, assay_info_df, project_id):
warning += warn
total_error += error

# target_capture_kit = read_group_headers['target_capture_kit']['enum']
# warn, error = process_functions.check_col_and_values(
# assay_info_df,
# 'target_capture_kit',
# target_capture_kit,
# filename="Assay_information.yaml",
# required=True)
# warning += warn
# total_error += error

if not process_functions.checkColExist(assay_info_df, "target_capture_kit"):
total_error += (
"Assay_information.yaml: " "Must have target_capture_kit column.\n"
Expand Down
27 changes: 13 additions & 14 deletions tests/test_assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@

import pandas as pd
import pytest

from genie_registry.assay import Assayinfo
from genie import extract, process_functions
from genie_registry.assay import Assayinfo

GDC_DATA_DICT = {
"properties": {
Expand Down Expand Up @@ -45,7 +44,7 @@ def test_validinput__validate(assay_info):
assay_info_dict = {
"SEQ_ASSAY_ID": ["SAGE-1", "SAGE-3"],
"is_paired_end": [True, False],
"library_strategy": ["value1", "value2"],
"library_strategy": ["Targeted Sequencing", "WXS"],
"library_selection": ["value1", "value2"],
"platform": ["value1", "value2"],
"instrument_model": ["value1", "value2"],
Expand All @@ -68,18 +67,18 @@ def test_validinput__validate(assay_info):
), patch.object(
process_functions, "get_gdc_data_dictionary", return_value=test_dict
) as patch_get_gdc:
error, warning = assay_info._validate(assay_info_df, "syn9999")
error, warning = assay_info._validate(assay_info_df)
assert error == ""
assert warning == ""
patch_get_gdc.assert_called()


def test_case__validate(assay_info):
"""Valid input should have no errors or warnings"""
"""Valid input with lowercase SEQ_ASSAY_ID, should have no errors or warnings"""
assay_info_dict = {
"SEQ_ASSAY_ID": ["sage-1", "SAGE-3"],
"is_paired_end": [True, False],
"library_strategy": ["value1", "value2"],
"library_strategy": ["Targeted Sequencing", "WXS"],
"library_selection": ["value1", "value2"],
"platform": ["value1", "value2"],
"instrument_model": ["value1", "value2"],
Expand All @@ -102,18 +101,18 @@ def test_case__validate(assay_info):
), patch.object(
process_functions, "get_gdc_data_dictionary", return_value=test_dict
) as patch_get_gdc:
error, warning = assay_info._validate(assay_info_df, "syn9999")
error, warning = assay_info._validate(assay_info_df)
assert error == ""
assert warning == ""
patch_get_gdc.assert_called()


def test_underscore__validate(assay_info):
"""Valid input should have no errors or warnings"""
"""Valid input with underscore in SEQ_ASSAY_ID, should have no errors or warnings"""
assay_info_dict = {
"SEQ_ASSAY_ID": ["SAGE_1", "SAGE-3"],
"is_paired_end": [True, False],
"library_strategy": ["value1", "value2"],
"library_strategy": ["Targeted Sequencing", "WXS"],
"library_selection": ["value1", "value2"],
"platform": ["value1", "value2"],
"instrument_model": ["value1", "value2"],
Expand All @@ -136,7 +135,7 @@ def test_underscore__validate(assay_info):
), patch.object(
process_functions, "get_gdc_data_dictionary", return_value=test_dict
) as patch_get_gdc:
error, warning = assay_info._validate(assay_info_df, "syn9999")
error, warning = assay_info._validate(assay_info_df)
assert error == ""
assert warning == ""
patch_get_gdc.assert_called()
Expand All @@ -149,7 +148,7 @@ def test__missingcols__validate(assay_info):
with patch.object(
process_functions, "get_gdc_data_dictionary", return_value=test_dict
) as patch_get_gdc:
error, warning = assay_info._validate(assay_info_df, "syn99999")
error, warning = assay_info._validate(assay_info_df)
expected_errors = (
"Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"
"Assay_information.yaml: Must have is_paired_end column.\n"
Expand Down Expand Up @@ -230,7 +229,7 @@ def test_invalid__validate(assay_info):
assay_info_dict = {
"SEQ_ASSAY_ID": ["SAGE-1", "SAG-2"],
"is_paired_end": [True, "foo"],
"library_strategy": ["foo", "ChIP-Seq"],
"library_strategy": ["foo", "WXS"],
"library_selection": ["foo", "PCR"],
"platform": ["foo", "Illumina"],
"instrument_model": ["foo", "Illumina HiSeq 4000"],
Expand All @@ -256,7 +255,7 @@ def test_invalid__validate(assay_info):
), patch.object(
process_functions, "get_gdc_data_dictionary", return_value=test_dict
) as patch_get_gdc:
error, warning = assay_info._validate(assay_info_df, "syn9999")
error, warning = assay_info._validate(assay_info_df)
expected_errors = (
"Assay_information.yaml: "
"Please make sure all your SEQ_ASSAY_IDs start with your "
Expand All @@ -270,7 +269,7 @@ def test_invalid__validate(assay_info):
"This column must only be these values: value1, value2\n"
"Assay_information.yaml: "
"Please double check your library_strategy column. "
"This column must only be these values: value1, value2\n"
"This column must only be these values: Targeted Sequencing, WXS\n"
"Assay_information.yaml: "
"Please double check your platform column. "
"This column must only be these values: value1, value2\n"
Expand Down
Loading

0 comments on commit a757ad7

Please sign in to comment.