From ab7c9315b83e345a39040293c973449c4cb14714 Mon Sep 17 00:00:00 2001
From: danlu1 <dan.lu@sagebase.org>
Date: Wed, 13 Nov 2024 01:01:02 +0000
Subject: [PATCH] Reformat process_functions.py and its tests (formatting only)

---
 genie/process_functions.py      | 61 +++++++++++++++------
 tests/test_process_functions.py | 96 ++++++++++++++++++++++-----------
 2 files changed, 109 insertions(+), 48 deletions(-)

diff --git a/genie/process_functions.py b/genie/process_functions.py
index 6f9f1572..80a566b3 100644
--- a/genie/process_functions.py
+++ b/genie/process_functions.py
@@ -982,7 +982,13 @@ def create_missing_columns(dataset: pd.DataFrame, schema: dict) -> pd.Series:
     return dataset[list(schema.keys())]
 
 
-def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possible_values: list, na_allowed: bool = False,  sep: Optional[str] = None) -> pd.Index:
+def get_row_indices_for_invalid_column_values(
+    df: pd.DataFrame,
+    col: str,
+    possible_values: list,
+    na_allowed: bool = False,
+    sep: Optional[str] = None,
+) -> pd.Index:
     """This function checks the column values against possible_values and returns row indices of invalid rows.
        Currently, this function is only used in assay.py
 
@@ -994,7 +1000,7 @@ def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possib
         sep (Optional[str], optional): The string separator. Defaults to None.
 
     Returns:
-        pd.Index: The row indices of the rows with values that are not in possible_values. 
+        pd.Index: The row indices of the rows with values that are not in possible_values.
     """
     if na_allowed:
         # this is only useful for dropping NAs for individual values rather than value_list
@@ -1003,12 +1009,17 @@ def get_row_indices_for_invalid_column_values(df: pd.DataFrame, col: str, possib
         check_values = df[col]
     if sep:
         # for columns contain lists of values
-        check_values = check_values.apply(lambda x: all(substring in possible_values for substring in x.split(sep)))
-    else: 
+        check_values = check_values.apply(
+            lambda x: all(substring in possible_values for substring in x.split(sep))
+        )
+    else:
         check_values = check_values.apply(lambda x: x in possible_values)
     return check_values[check_values == False].index
 
-def get_message_for_invalid_column_value(col: str, filename: str, invalid_indices: pd.Index, possible_values: list) -> tuple:
+
+def get_message_for_invalid_column_value(
+    col: str, filename: str, invalid_indices: pd.Index, possible_values: list
+) -> tuple:
     """This function returns the error and warning messages if the target column has rows with invalid values.
        Currently, this function is only used in assay.py
 
@@ -1025,15 +1036,27 @@ def get_message_for_invalid_column_value(col: str, filename: str, invalid_indice
     error = ""
     # check the validity of values in the column
     # concatenated possible values. This is done because of pandas typing. An integer column with one NA/blank value will be cast as a double.
-    possible_values = ", ".join([str(value).replace(".0", "")for value in possible_values])
-    if len(invalid_indices) > 0: 
-        error = (f"{filename}: Please double check your {col} column. Valid values are {possible_values}. "
-                f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
-                f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n")
+    possible_values = ", ".join(
+        [str(value).replace(".0", "") for value in possible_values]
+    )
+    if len(invalid_indices) > 0:
+        error = (
+            f"{filename}: Please double check your {col} column. Valid values are {possible_values}. "
+            f"You have {len(invalid_indices)} row(s) in your file where {col} column contains invalid values. "
+            f"The row(s) this occurs in are: {invalid_indices.tolist()}. Please correct.\n"
+        )
     return (warning, error)
 
 
-def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_values: list, filename: str, na_allowed: bool = False, required=False, sep: Optional[str] = None) -> tuple:
+def check_column_and_values_row_specific(
+    df: pd.DataFrame,
+    col: str,
+    possible_values: list,
+    filename: str,
+    na_allowed: bool = False,
+    required=False,
+    sep: Optional[str] = None,
+) -> tuple:
     """This function checks if the column exists and checks if the values in the column have the valid values.
        Currently, this function is only used in assay.py
 
@@ -1051,7 +1074,7 @@ def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_va
     """
     warning = ""
     error = ""
-    # check the existence of the column 
+    # check the existence of the column
     have_column = checkColExist(df, col)
     if not have_column:
         if required:
@@ -1063,10 +1086,14 @@ def check_column_and_values_row_specific(df: pd.DataFrame, col: str, possible_va
                 "{filename}: Doesn't have {col} column. "
                 "This column will be added.\n".format(filename=filename, col=col)
             )
-    else: 
+    else:
         # get the row indices
-        invalid_indices = get_row_indices_for_invalid_column_values(df, col, possible_values, na_allowed, sep)
+        invalid_indices = get_row_indices_for_invalid_column_values(
+            df, col, possible_values, na_allowed, sep
+        )
         # generate validation message
-        warning, error = get_message_for_invalid_column_value(col, filename, invalid_indices, possible_values)
-    
-    return (warning, error)
\ No newline at end of file
+        warning, error = get_message_for_invalid_column_value(
+            col, filename, invalid_indices, possible_values
+        )
+
+    return (warning, error)
diff --git a/tests/test_process_functions.py b/tests/test_process_functions.py
index 89b8fdb5..48c1fd79 100644
--- a/tests/test_process_functions.py
+++ b/tests/test_process_functions.py
@@ -5,8 +5,12 @@
 import pytest
 import synapseclient
 from genie import process_functions
-from pandas.api.types import (is_bool_dtype, is_float_dtype, is_integer_dtype,
-                              is_string_dtype)
+from pandas.api.types import (
+    is_bool_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_string_dtype,
+)
 from pandas.testing import assert_frame_equal
 
 DATABASE_DF = pd.DataFrame(
@@ -752,10 +756,20 @@ def get_row_indices_for_invalid_column_values_test_cases():
         },
         {
             "name": "values_in_list",
-            "df": pd.DataFrame({"test_col": ["Val1;Val2", "Val1;Val2;Val3","Val1", "Val1;", "Val1;None"]}),
+            "df": pd.DataFrame(
+                {
+                    "test_col": [
+                        "Val1;Val2",
+                        "Val1;Val2;Val3",
+                        "Val1",
+                        "Val1;",
+                        "Val1;None",
+                    ]
+                }
+            ),
             "col": "test_col",
             "possible_values": ["Val1", "Val2"],
-            "na_allowed": True, 
+            "na_allowed": True,
             "sep": ";",
             "expected_index": pd.Index([1, 3, 4]),
         },
@@ -769,8 +783,12 @@ def get_row_indices_for_invalid_column_values_test_cases():
             "expected_index": pd.Index([]),
         },
     ]
+
+
 @pytest.mark.parametrize(
-    "test_cases", get_row_indices_for_invalid_column_values_test_cases(), ids=lambda x: x["name"]
+    "test_cases",
+    get_row_indices_for_invalid_column_values_test_cases(),
+    ids=lambda x: x["name"],
 )
 def test_get_row_indices_for_invalid_column_values(test_cases):
     df = test_cases["df"]
@@ -778,9 +796,12 @@ def test_get_row_indices_for_invalid_column_values(test_cases):
     possible_values = test_cases["possible_values"]
     na_allowed = test_cases["na_allowed"]
     sep = test_cases["sep"]
-    results = process_functions.get_row_indices_for_invalid_column_values(df, col, possible_values, na_allowed, sep)
+    results = process_functions.get_row_indices_for_invalid_column_values(
+        df, col, possible_values, na_allowed, sep
+    )
     assert results.equals(test_cases["expected_index"])
 
+
 def get_message_for_invalid_column_value_test_cases():
     return [
         {
@@ -789,10 +810,10 @@ def get_message_for_invalid_column_value_test_cases():
             "filename": "test_filename",
             "invalid_indices": pd.Index([1, 2, 3]),
             "possible_values": ["Val1"],
-            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
-                              "You have 3 row(s) in your file where test_col column contains invalid values. "\
-                              "The row(s) this occurs in are: [1, 2, 3]. Please correct.\n",
-            "expected_warning": ""
+            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
+            "You have 3 row(s) in your file where test_col column contains invalid values. "
+            "The row(s) this occurs in are: [1, 2, 3]. Please correct.\n",
+            "expected_warning": "",
         },
         {
             "name": "valid_data",
@@ -801,21 +822,28 @@ def get_message_for_invalid_column_value_test_cases():
             "invalid_indices": pd.Index([]),
             "possible_values": ["Val1", "Val2"],
             "expected_error": "",
-            "expected_warning": ""
+            "expected_warning": "",
         },
     ]
+
+
 @pytest.mark.parametrize(
-    "test_cases", get_message_for_invalid_column_value_test_cases(), ids=lambda x: x["name"]
+    "test_cases",
+    get_message_for_invalid_column_value_test_cases(),
+    ids=lambda x: x["name"],
 )
 def test_get_message_for_invalid_column_value(test_cases):
     col = test_cases["col"]
     filename = test_cases["filename"]
     invalid_indices = test_cases["invalid_indices"]
     possible_values = test_cases["possible_values"]
-    warning, error = process_functions.get_message_for_invalid_column_value(col, filename, invalid_indices, possible_values)
+    warning, error = process_functions.get_message_for_invalid_column_value(
+        col, filename, invalid_indices, possible_values
+    )
     assert warning == test_cases["expected_warning"]
     assert error == test_cases["expected_error"]
 
+
 def check_col_and_values_row_specific_test_cases():
     return [
         {
@@ -828,7 +856,7 @@ def check_col_and_values_row_specific_test_cases():
             "required": True,
             "sep": ";",
             "expected_error": "",
-            "expected_warning": ""
+            "expected_warning": "",
         },
         {
             "name": "valid_data_with_individual_value_na_allowed",
@@ -840,7 +868,7 @@ def check_col_and_values_row_specific_test_cases():
             "required": True,
             "sep": ";",
             "expected_error": "",
-            "expected_warning": ""
+            "expected_warning": "",
         },
         {
             "name": "missing_required_column",
@@ -852,7 +880,7 @@ def check_col_and_values_row_specific_test_cases():
             "required": True,
             "sep": ";",
             "expected_error": "test_filename: Must have test_col1 column.\n",
-            "expected_warning": ""
+            "expected_warning": "",
         },
         {
             "name": "missing_optional_column",
@@ -864,7 +892,7 @@ def check_col_and_values_row_specific_test_cases():
             "required": False,
             "sep": ";",
             "expected_error": "",
-            "expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n"
+            "expected_warning": "test_filename: Doesn't have test_col1 column. This column will be added.\n",
         },
         {
             "name": "invalid_data_with_value_list",
@@ -875,10 +903,10 @@ def check_col_and_values_row_specific_test_cases():
             "na_allowed": True,
             "required": True,
             "sep": ";",
-            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
-                              "You have 2 row(s) in your file where test_col column contains invalid values. "\
-                              "The row(s) this occurs in are: [1, 2]. Please correct.\n",
-            "expected_warning": ""
+            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
+            "You have 2 row(s) in your file where test_col column contains invalid values. "
+            "The row(s) this occurs in are: [1, 2]. Please correct.\n",
+            "expected_warning": "",
         },
         {
             "name": "invalid_data_with_individual_value_na_not_allowed",
@@ -889,10 +917,10 @@ def check_col_and_values_row_specific_test_cases():
             "na_allowed": False,
             "required": True,
             "sep": None,
-            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. "\
-                              "You have 3 row(s) in your file where test_col column contains invalid values. "\
-                              "The row(s) this occurs in are: [2, 3, 4]. Please correct.\n",
-            "expected_warning": ""
+            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1, Val2. "
+            "You have 3 row(s) in your file where test_col column contains invalid values. "
+            "The row(s) this occurs in are: [2, 3, 4]. Please correct.\n",
+            "expected_warning": "",
         },
         {
             "name": "invalid_data_with_individual_value_na_allowed",
@@ -903,14 +931,18 @@ def check_col_and_values_row_specific_test_cases():
             "na_allowed": True,
             "required": True,
             "sep": None,
-            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "\
-                              "You have 2 row(s) in your file where test_col column contains invalid values. "\
-                              "The row(s) this occurs in are: [1, 2]. Please correct.\n",
-            "expected_warning": ""
+            "expected_error": "test_filename: Please double check your test_col column. Valid values are Val1. "
+            "You have 2 row(s) in your file where test_col column contains invalid values. "
+            "The row(s) this occurs in are: [1, 2]. Please correct.\n",
+            "expected_warning": "",
         },
     ]
+
+
 @pytest.mark.parametrize(
-    "test_cases", check_col_and_values_row_specific_test_cases(), ids=lambda x: x["name"]
+    "test_cases",
+    check_col_and_values_row_specific_test_cases(),
+    ids=lambda x: x["name"],
 )
 def test_check_col_and_values_row_specific(test_cases):
     df = test_cases["df"]
@@ -920,6 +952,8 @@ def test_check_col_and_values_row_specific(test_cases):
     na_allowed = test_cases["na_allowed"]
     required = test_cases["required"]
     sep = test_cases["sep"]
-    warning, error = process_functions.check_column_and_values_row_specific(df, col, possible_values, filename, na_allowed, required, sep)
+    warning, error = process_functions.check_column_and_values_row_specific(
+        df, col, possible_values, filename, na_allowed, required, sep
+    )
     assert warning == test_cases["expected_warning"]
     assert error == test_cases["expected_error"]