Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fds 2333 validate attribute unit tests #1517

Open
wants to merge 6 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 82 additions & 53 deletions schematic/models/validate_attribute.py
Original file line number Diff line number Diff line change
Expand Up @@ -914,14 +914,14 @@ def get_target_manifests(
def list_validation(
self,
val_rule: str,
manifest_col: pd.core.series.Series,
) -> tuple[list[list[str]], list[list[str]], pd.core.series.Series]:
manifest_col: pd.Series,
) -> tuple[list[list[str]], list[list[str]], pd.Series]:
"""
Purpose:
Determine if values for a particular attribute are comma separated.
Input:
- val_rule: str, Validation rule
- manifest_col: pd.core.series.Series, column for a given attribute
- manifest_col: pd.Series, column for a given attribute
Returns:
- manifest_col: Input values in manifest arere-formatted to a list
logger.error or logger.warning.
Expand All @@ -932,8 +932,8 @@ def list_validation(
# For each 'list' (input as a string with a , delimiter) entered,
# convert to a real list of strings, with leading and trailing
# white spaces removed.
errors = []
warnings = []
errors: list[list[str]] = []
warnings: list[list[str]] = []
replace_null = True

csv_re = comma_separated_list_regex()
Expand All @@ -954,7 +954,10 @@ def list_validation(
entry=list_string,
node_display_name=manifest_col.name,
)

# Because of the above line: manifest_col = manifest_col.astype(str)
# this column has been turned into a string, it's unclear if any values
# from this column can be anything other than a string, and therefore this
# if statement may not be needed
if not isinstance(list_string, str) and entry_has_value:
list_error = "not_a_string"
elif not re.fullmatch(csv_re, list_string) and entry_has_value:
Expand Down Expand Up @@ -983,15 +986,15 @@ def list_validation(
def regex_validation(
self,
val_rule: str,
manifest_col: pd.core.series.Series,
manifest_col: pd.Series,
) -> tuple[list[list[str]], list[list[str]]]:
"""
Purpose:
Check if values for a given manifest attribue conform to the reguar expression,
provided in val_rule.
Input:
- val_rule: str, Validation rule
- manifest_col: pd.core.series.Series, column for a given
- manifest_col: pd.Series, column for a given
attribute in the manifest
- dmge: DataModelGraphExplorer Object
Using this module requres validation rules written in the following manner:
Expand All @@ -1005,8 +1008,8 @@ def regex_validation(
- This function will return errors when the user input value
does not match schema specifications.
logger.error or logger.warning.
Errors: list[str] Error details for further storage.
warnings: list[str] Warning details for further storage.
Errors: list[list[str]] Error details for further storage.
warnings: list[list[str]] Warning details for further storage.
TODO:
move validation to convert step.
"""
Expand All @@ -1022,13 +1025,15 @@ def regex_validation(
f" They should be provided as follows ['regex', 'module name', 'regular expression']"
)

errors = []
warnings = []
errors: list[list[str]] = []
warnings: list[list[str]] = []

validation_rules = self.dmge.get_node_validation_rules(
node_display_name=manifest_col.name
)

# It seems like this statement can never be true
# self.dmge.get_node_validation_rules never returns a list with "::" even when
# the attribute has the "list::regex" rule
if validation_rules and "::" in validation_rules[0]:
validation_rules = validation_rules[0].split("::")
# Handle case where validating re's within a list.
Expand Down Expand Up @@ -1096,7 +1101,7 @@ def regex_validation(
def type_validation(
self,
val_rule: str,
manifest_col: pd.core.series.Series,
manifest_col: pd.Series,
) -> tuple[list[list[str]], list[list[str]]]:
"""
Purpose:
Expand All @@ -1105,7 +1110,7 @@ def type_validation(
Input:
- val_rule: str, Validation rule, specifying input type, either
'float', 'int', 'num', 'str'
- manifest_col: pd.core.series.Series, column for a given
- manifest_col: pd.Series, column for a given
attribute in the manifest
Returns:
-This function will return errors when the user input value
Expand All @@ -1123,8 +1128,8 @@ def type_validation(
"str": (str),
}

errors = []
warnings = []
errors: list[list[str]] = []
warnings: list[list[str]] = []

# num indicates either a float or int.
if val_rule == "num":
Expand All @@ -1147,6 +1152,7 @@ def type_validation(
)
if vr_errors:
errors.append(vr_errors)
# It seems impossible to get warnings with type rules
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this have any follow-up tech debt ticket?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think so. I wanted to mention it here to see if I misunderstood something first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@andrewelamb I'm not sure where I'm seeing where it's impossible, could you explain in case I missed something? The default message for all the type rules is error but for every validation rule a user could specify in their data model to specifically raise warnings or error regardless of the default level

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@GiaJordan that doesn't appear to be true. See here.

"Use the type checker by setting the ‘Validation Rules’ to one of the types listed below. No additional specifiers are allowed. "

If you look at the code in the validator above that is confirmed. Unless the val_rule is one of ["num", "str", "int", "float"] you'll just get empty errors and warnings back.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@andrewelamb that does look like the current behavior, I don't think that's intended though, but that would probably be another discussion. Message level specifications are allowed in the rule definitions in validation_rule_info()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll start a jira issue then.

if vr_warnings:
warnings.append(vr_warnings)
elif val_rule in ["int", "float", "str"]:
Expand All @@ -1169,6 +1175,7 @@ def type_validation(
)
if vr_errors:
errors.append(vr_errors)
# It seems impossible to get warnings with type rules
if vr_warnings:
warnings.append(vr_warnings)
return errors, warnings
Expand Down Expand Up @@ -1286,14 +1293,14 @@ def url_validation(
return errors, warnings

def _parse_validation_log(
self, validation_log: dict[str, pd.core.series.Series]
) -> tuple[[list[str], list[str], list[str]]]:
self, validation_log: dict[str, pd.Series]
) -> tuple[list[str], list[str], list[str]]:
"""Parse validation log, so values can be used to raise warnings/errors
Args:
validation_log, dict[str, pd.core.series.Series]:
validation_log, dict[str, pd.Series]:
Returns:
invalid_rows, list: invalid rows recorded in the validation log
invalid_enties, list: invalid values recorded in the validation log
invalid_entities, list: invalid values recorded in the validation log
manifest_ids, list:
"""
# Initialize parameters
Expand All @@ -1313,12 +1320,15 @@ def _parse_validation_log(
return invalid_rows, invalid_entries, manifest_ids

def _merge_format_invalid_rows_values(
self, series_1: pd.core.series.Series, series_2: pd.core.series.Series
) -> tuple[[list[str], list[str]]]:
"""Merge two series to identify gather all invalid values, and parse out invalid rows and entries
self, series_1: pd.Series, series_2: pd.Series
) -> tuple[list[str], list[str]]:
"""
Merge two series to identify gather all invalid values,
and parse out invalid rows and entries

Args:
series_1, pd.core.series.Series: first set of invalid values to extract
series_2, pd.core.series.Series: second set of invalid values to extract
series_1, pd.Series: first set of invalid values to extract
series_2, pd.Series: second set of invalid values to extract
Returns:
invalid_rows, list: invalid rows taken from both series
invalid_entry, list: invalid values taken from both series
Expand All @@ -1342,12 +1352,14 @@ def _merge_format_invalid_rows_values(
return invalid_rows, invalid_entry

def _format_invalid_row_values(
self, invalid_values: dict[str, pd.core.series.Series]
) -> tuple[[list[str], list[str]]]:
"""Parse invalid_values dictionary, to extract invalid_rows and invalid_entry to be used later
to raise warnings or errors.
self, invalid_values: pd.Series
) -> tuple[list[str], list[str]]:
"""
Parse invalid_values, to extract invalid_rows and invalid_entry
to be used later to raise warnings or errors.

Args:
invalid_values, dict[str, pd.core.series.Series]:
invalid_values, pd.Series:
Returns:
invalid_rows, list: invalid rows recorded in invalid_values
invalid_entry, list: invalid values recorded in invalid_values
Expand Down Expand Up @@ -1383,9 +1395,9 @@ def _gather_set_warnings_errors(

Returns:
errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do
not pass relevant cross mannifest validation across the target manifest(s)
not pass relevant cross manifest validation across the target manifest(s)
warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do
not pass relevant cross mannifest validation across the target manifest(s)
not pass relevant cross manifest validation across the target manifest(s)
"""
errors: list[str] = []
warnings: list[str] = []
Expand Down Expand Up @@ -1440,21 +1452,28 @@ def _remove_non_entry_from_invalid_entry_list(
row_num: Optional[list[str]],
attribute_name: str,
) -> tuple[list[str], list[str]]:
"""Helper to remove NAs from a list of invalid entries (if applicable, and allowed), remove the row
too from row_num. This will make sure errors are not rasied for NA entries unless the value is required.
"""
Helper to remove NAs from a list of invalid entries (if applicable, and allowed),
remove the row too from row_num. This will make sure errors are not raised for
NA entries unless the value is required.

Args:
invalid_entry, list[str]: default=None, list of entries in the source manifest where
invalid values were located.
row_num, list[str[: default=None, list of rows in the source manifest where invalid values were located
invalid values were located.
row_num, list[str[: default=None, list of rows in the source manifest where invalid
values were located
attribute_name, str: source attribute name

Returns:
invalid_entry and row_num returned with any NA and corresponding row index value removed, if applicable.
invalid_entry and row_num returned with any NA and corresponding row index value
removed, if applicable.
"""
idx_to_remove = []
# Check if the current attribute column is required, via the data model
if invalid_entry and row_num:
# Check each invalid entry and determine if it has a value and/or is required.
# If there is no entry and its not required, remove the NA value so an error is not raised.
# If there is no entry and its not required, remove the NA value so an
# error is not raised.
for idx, entry in enumerate(invalid_entry):
entry_has_value = self.get_entry_has_value(entry, attribute_name)
# If there is no value, and is not required, recored the index
Expand All @@ -1466,8 +1485,8 @@ def _remove_non_entry_from_invalid_entry_list(
for idx in sorted(idx_to_remove, reverse=True):
del invalid_entry[idx]
del row_num[idx]
# Perform check to make sure length of invalid_entry and row_num is the same. If not that would suggest
# there was an issue recording or removing values.
# Perform check to make sure length of invalid_entry and row_num is the same.
# If not that would suggest there was an issue recording or removing values.
if len(invalid_entry) != len(row_num):
logger.error(
f"There was an error handling and validating a non-entry."
Expand Down Expand Up @@ -1530,17 +1549,22 @@ def _gather_value_warnings_errors(
source_attribute: str,
value_validation_store: tuple[pd.Series, pd.Series, pd.Series],
) -> tuple[list[str], list[str]]:
"""For value rule scope, find invalid rows and entries, and generate appropriate errors and warnings
"""
For value rule scope, find invalid rows and entries, and generate
appropriate errors and warnings

Args:
val_rule, str: Validation rule
source_attribute, str: source manifest column name
value_validation_store, tuple(pd.Series, pd.Series, pd.Series]):
contains missing_values, duplicated_values, and repeat values
Returns:
errors, list[str]: list of errors to raise, as appropriate, if values in current manifest do
not pass relevant cross mannifest validation across the target manifest(s)
warnings, list[str]: list of warnings to raise, as appropriate, if values in current manifest do
not pass relevant cross mannifest validation across the target manifest(s)
errors, list[str]: list of errors to raise, as appropriate, if values
in current manifest do not pass relevant cross manifest validation
across the target manifest(s)
warnings, list[str]: list of warnings to raise, as appropriate,
if values in current manifest do not pass relevant cross manifest
validation across the target manifest(s)
"""
# Initialize with empty lists
errors, warnings = [], []
Expand Down Expand Up @@ -1580,17 +1604,22 @@ def _gather_value_warnings_errors(

def _check_if_target_manifest_is_empty(
self,
target_manifest: pd.core.series.Series,
target_manifest: pd.DataFrame,
target_manifest_empty: list[bool],
column_names: dict[str, str],
) -> list[bool]:
"""If a target manifest is found with the attribute column of interest check to see if the manifest is empty.
"""
If a target manifest is found with the attribute column of interest check to see if
the manifest is empty.

Args:
target_manifest, pd.core.series.Series: Current target manifest
target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not.
target_manifest, pd.Dataframe: Current target manifest
target_manifest_empty, list[bool]: a list of booleans recording if the target manifest
are empty or not.
column_names, dict[str, str]: {stripped_col_name:original_column_name}
Returns:
target_manifest_empty, list[bool]: a list of booleans recording if the target manifest are emtpy or not.
target_manifest_empty, list[bool]: a list of booleans recording if the target manifest
are empty or not.
"""
# Make a copy of the target manifest with only user uploaded columns
target_manifest_dupe = target_manifest.drop(
Expand Down Expand Up @@ -2040,7 +2069,7 @@ def cross_validation(
def filename_validation(
self,
val_rule: str,
manifest: pd.core.frame.DataFrame,
manifest: pd.DataFrame,
access_token: str,
dataset_scope: str,
project_scope: Optional[list] = None,
Expand All @@ -2050,7 +2079,7 @@ def filename_validation(
Validate the filenames in the manifest against the data paths in the fileview.
Args:
val_rule: str, Validation rule for the component
manifest: pd.core.frame.DataFrame, manifest
manifest: pd.DataFrame, manifest
access_token: str, Asset Store access token
dataset_scope: str, Dataset with files to validate against
project_scope: Optional[list] = None: Projects to limit the scope of cross manifest validation to.
Expand Down
Loading
Loading