Merge branch 'main' into raen/verify/files

astronomy-commons · Sep 18, 2024 · 3855e94 · 3855e94
2 parents a6d24e4 + 829fe47
commit 3855e94
Show file tree

Hide file tree

Showing 30 changed files with 217 additions and 332 deletions.
diff --git a/docs/catalogs/arguments.rst b/docs/catalogs/arguments.rst
@@ -169,7 +169,7 @@ You can find the full API documentation for
     )
 
 If you're reading from cloud storage, or otherwise have some filesystem credential
-dict, put those in ``input_storage_options``.
+dict, initialize ``input_path`` (or the entries of ``input_file_list``) using ``universal_pathlib``'s utilities.
 
 Indexed batching strategy
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -304,7 +304,7 @@ preferable to delete any existing contents, however, as this may cause
 unexpected side effects.
 
 If you're writing to cloud storage, or otherwise have some filesystem credential
-dict, put those in ``output_storage_options``.
+dict, initialize ``output_path`` using ``universal_pathlib``'s utilities.
 
 In addition, you can specify directories to use for various intermediate files:
 

diff --git a/docs/guide/index_table.rst b/docs/guide/index_table.rst
@@ -229,7 +229,7 @@ preferable to delete any existing contents, however, as this may cause
 unexpected side effects.
 
 If you're writing to cloud storage, or otherwise have some filesystem credential
-dict, put those in ``output_storage_options``.
+dict, initialize ``output_path`` using ``universal_pathlib``'s utilities.
 
 In addition, you can specify directories to use for various intermediate files:
 

diff --git a/docs/guide/margin_cache.rst b/docs/guide/margin_cache.rst
@@ -141,7 +141,7 @@ preferable to delete any existing contents, however, as this may cause
 unexpected side effects.
 
 If you're writing to cloud storage, or otherwise have some filesystem credential
-dict, put those in ``output_storage_options``.
+dict, initialize ``output_path`` using ``universal_pathlib``'s utilities.
 
 In addition, you can specify directories to use for various intermediate files:
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,6 +25,7 @@ dependencies = [
     "pyyaml",
     "scipy",
     "tqdm",
+    "universal_pathlib",
 ]
 
 # On a mac, install optional dependencies with `pip install '.[dev]'` (include the single quotes)

diff --git a/src/hipscat_import/catalog/arguments.py b/src/hipscat_import/catalog/arguments.py
@@ -3,11 +3,12 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, field
-from typing import Any, Dict, List, Union
+from pathlib import Path
+from typing import List
 
 from hipscat.catalog.catalog import CatalogInfo
-from hipscat.io import FilePointer
 from hipscat.pixel_math import hipscat_id
+from upath import UPath
 
 from hipscat_import.catalog.file_readers import InputReader, get_file_reader
 from hipscat_import.runtime_arguments import RuntimeArguments, find_input_paths
@@ -24,14 +25,12 @@ class ImportArguments(RuntimeArguments):
 
     catalog_type: str = "object"
     """level of catalog data, object (things in the sky) or source (detections)"""
-    input_path: FilePointer | None = None
+    input_path: str | Path | UPath | None = None
     """path to search for the input data"""
-    input_file_list: List[FilePointer] = field(default_factory=list)
+    input_file_list: List[str | Path | UPath] = field(default_factory=list)
     """can be used instead of input_path to import only specified files"""
-    input_paths: List[FilePointer] = field(default_factory=list)
+    input_paths: List[str | Path | UPath] = field(default_factory=list)
     """resolved list of all files that will be used in the importer"""
-    input_storage_options: Union[Dict[Any, Any], None] = None
-    """optional dictionary of abstract filesystem credentials for the INPUT."""
 
     ra_column: str = "ra"
     """column for right ascension"""
@@ -45,7 +44,7 @@ class ImportArguments(RuntimeArguments):
     resolve the counter within the same higher-order pixel space"""
     add_hipscat_index: bool = True
     """add the hipscat spatial index field alongside the data"""
-    use_schema_file: str | None = None
+    use_schema_file: str | Path | UPath | None = None
     """path to a parquet file with schema metadata. this will be used for column
     metadata when writing the files, if specified"""
     expected_total_rows: int = 0
@@ -130,12 +129,7 @@ def _check_arguments(self):
                 raise ValueError("When using _hipscat_index for position, no sort columns should be added")
 
         # Basic checks complete - make more checks and create directories where necessary
-        self.input_paths = find_input_paths(
-            self.input_path,
-            "**/*.*",
-            self.input_file_list,
-            storage_options=self.input_storage_options,
-        )
+        self.input_paths = find_input_paths(self.input_path, "**/*.*", self.input_file_list)
 
     def to_catalog_info(self, total_rows) -> CatalogInfo:
         """Catalog-type-specific dataset info."""

diff --git a/src/hipscat_import/catalog/file_readers.py b/src/hipscat_import/catalog/file_readers.py
@@ -1,15 +1,15 @@
 """File reading generators for common file types."""
 
 import abc
-from typing import Any, Dict, Union
 
 import pandas as pd
 import pyarrow
 import pyarrow.dataset
 import pyarrow.parquet as pq
 from astropy.io import ascii as ascii_reader
 from astropy.table import Table
-from hipscat.io import FilePointer, file_io
+from hipscat.io import file_io
+from upath import UPath
 
 # pylint: disable=too-few-public-methods,too-many-arguments
 
@@ -113,30 +113,40 @@ def provenance_info(self) -> dict:
             all_args["kwargs"]["storage_options"] = "REDACTED"
         return {"input_reader_type": type(self).__name__, **vars(self)}
 
-    def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **_kwargs):
+    def regular_file_exists(self, input_file, **_kwargs):
         """Check that the `input_file` points to a single regular file
 
         Raises:
             FileNotFoundError: if nothing exists at path, or directory found.
         """
-        if not file_io.does_file_or_directory_exist(input_file, storage_options=storage_options):
+        if not file_io.does_file_or_directory_exist(input_file):
             raise FileNotFoundError(f"File not found at path: {input_file}")
-        if not file_io.is_regular_file(input_file, storage_options=storage_options):
+        if not file_io.is_regular_file(input_file):
             raise FileNotFoundError(f"Directory found at path - requires regular file: {input_file}")
 
-    def read_index_file(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **kwargs):
+    def read_index_file(self, input_file, upath_kwargs=None, **kwargs):
         """Read an "indexed" file.
 
         This should contain a list of paths to files to be read and batched.
 
+        In order to create a valid connection to the string paths, provide any
+        additional universal pathlib (i.e. fsspec) arguments to the `upath_kwargs` kwarg.
+        In this way, the "index" file may contain a list of paths on a remote service,
+        and the `upath_kwargs` will be used to create a connection to that remote service.
+
         Raises:
             FileNotFoundError: if nothing exists at path, or directory found.
         """
+        input_file = file_io.get_upath(input_file)
         self.regular_file_exists(input_file, **kwargs)
-        file_names = file_io.load_text_file(input_file, storage_options=storage_options)
+        file_names = file_io.load_text_file(input_file)
         file_names = [f.strip() for f in file_names]
-        file_names = [f for f in file_names if f]
-        return file_names
+        if upath_kwargs is None:
+            upath_kwargs = {}
+
+        file_paths = [UPath(f, **upath_kwargs) for f in file_names if f]
+
+        return file_paths
 
 
 class CsvReader(InputReader):
@@ -170,6 +180,7 @@ def __init__(
         column_names=None,
         type_map=None,
         parquet_kwargs=None,
+        upath_kwargs=None,
         **kwargs,
     ):
         self.chunksize = chunksize
@@ -178,14 +189,15 @@ def __init__(
         self.column_names = column_names
         self.type_map = type_map
         self.parquet_kwargs = parquet_kwargs
+        self.upath_kwargs = upath_kwargs
         self.kwargs = kwargs
 
         schema_parquet = None
         if self.schema_file:
             if self.parquet_kwargs is None:
                 self.parquet_kwargs = {}
             schema_parquet = file_io.read_parquet_file_to_pandas(
-                FilePointer(self.schema_file),
+                self.schema_file,
                 **self.parquet_kwargs,
             )
 
@@ -206,7 +218,7 @@ def read(self, input_file, read_columns=None):
             self.kwargs["usecols"] = read_columns
 
         return file_io.load_csv_to_pandas_generator(
-            FilePointer(input_file),
+            input_file,
             chunksize=self.chunksize,
             header=self.header,
             **self.kwargs,
@@ -220,11 +232,13 @@ class IndexedCsvReader(CsvReader):
     """
 
     def read(self, input_file, read_columns=None):
-        file_names = self.read_index_file(input_file=input_file, **self.kwargs)
+        file_paths = self.read_index_file(
+            input_file=input_file, upath_kwargs=self.upath_kwargs, **self.kwargs
+        )
 
         batch_size = 0
         batch_frames = []
-        for file in file_names:
+        for file in file_paths:
             for single_frame in super().read(file, read_columns=read_columns):
                 if batch_size + len(single_frame) >= self.chunksize:
                     # We've hit our chunksize, send the batch off to the task.
@@ -382,18 +396,22 @@ def __init__(
         fragment_readahead=4,
         use_threads=True,
         column_names=None,
+        upath_kwargs=None,
         **kwargs,
     ):
         self.chunksize = chunksize
         self.batch_readahead = batch_readahead
         self.fragment_readahead = fragment_readahead
         self.use_threads = use_threads
         self.column_names = column_names
+        self.upath_kwargs = upath_kwargs
         self.kwargs = kwargs
 
     def read(self, input_file, read_columns=None):
         columns = read_columns or self.column_names
-        file_names = self.read_index_file(input_file=input_file, **self.kwargs)
+        file_names = self.read_index_file(
+            input_file=input_file, upath_kwargs=self.upath_kwargs, **self.kwargs
+        )
         (_, input_dataset) = file_io.read_parquet_dataset(file_names, **self.kwargs)
 
         batches, nrows = [], 0