diff --git a/src/hipscat_import/catalog/arguments.py b/src/hipscat_import/catalog/arguments.py index a05ea9d4..fb43b35b 100644 --- a/src/hipscat_import/catalog/arguments.py +++ b/src/hipscat_import/catalog/arguments.py @@ -6,12 +6,12 @@ from typing import List from hipscat.catalog.catalog import CatalogInfo -from hipscat.io import FilePointer, file_io +from hipscat.io import FilePointer from hipscat.pixel_math import hipscat_id from hipscat_import.catalog.file_readers import InputReader, get_file_reader from hipscat_import.catalog.resume_plan import ResumePlan -from hipscat_import.runtime_arguments import RuntimeArguments +from hipscat_import.runtime_arguments import RuntimeArguments, find_input_paths # pylint: disable=too-many-locals,too-many-arguments,too-many-instance-attributes,too-many-branches,too-few-public-methods @@ -102,14 +102,7 @@ def _check_arguments(self): self.file_reader = get_file_reader(self.input_format) # Basic checks complete - make more checks and create directories where necessary - if self.input_path: - if not file_io.does_file_or_directory_exist(self.input_path): - raise FileNotFoundError("input_path not found on local storage") - self.input_paths = file_io.find_files_matching_path(self.input_path, f"*{self.input_format}") - elif self.input_file_list: - self.input_paths = self.input_file_list - if len(self.input_paths) == 0: - raise FileNotFoundError("No input files found") + self.input_paths = find_input_paths(self.input_path, f"*{self.input_format}", self.input_file_list) self.resume_plan = ResumePlan( resume=self.resume, progress_bar=self.progress_bar, diff --git a/src/hipscat_import/cross_match/macauff_arguments.py b/src/hipscat_import/cross_match/macauff_arguments.py index 93f3bca6..74283f18 100644 --- a/src/hipscat_import/cross_match/macauff_arguments.py +++ b/src/hipscat_import/cross_match/macauff_arguments.py @@ -4,10 +4,10 @@ from os import path from typing import List -from hipscat.io import FilePointer, file_io +from hipscat.io 
def find_input_paths(input_path="", file_matcher="", input_file_list=None):
    """Helper method to find input paths, given either a prefix and format, or an
    explicit list of paths.

    Args:
        input_path (str): prefix to search for
        file_matcher (str): matcher to use when searching for files
        input_file_list (List[str]): list of input paths
    Returns:
        matching files, if input_path is provided, otherwise, input_file_list
    Raises:
        FileNotFoundError: if input_path is provided but does not exist, if no
            files under input_path match file_matcher, or if neither input_path
            nor a non-empty input_file_list is provided.
    """
    if input_path:
        if not file_io.does_file_or_directory_exist(input_path):
            raise FileNotFoundError("input_path not found on local storage")
        input_paths = file_io.find_files_matching_path(input_path, file_matcher)
    else:
        # Always bind input_paths: when no prefix is given and the explicit list
        # is None or empty, fall through to the "No input files found" error
        # below instead of hitting an UnboundLocalError on the length check.
        input_paths = input_file_list if input_file_list else []
    if len(input_paths) == 0:
        raise FileNotFoundError("No input files found")
    return input_paths