Skip to content

Commit

Permalink
get all file_reader provenance info programmatically (#316)
Browse files Browse the repository at this point in the history
  • Loading branch information
troyraen authored May 21, 2024
1 parent 80991e9 commit 8b19bc2
Showing 1 changed file with 1 addition and 34 deletions.
35 changes: 1 addition & 34 deletions src/hipscat_import/catalog/file_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ def read(self, input_file, read_columns=None):
DataFrame containing chunk of file info.
"""

@abc.abstractmethod
def provenance_info(self) -> dict:
    """Collect this reader's settings for provenance tracking.

    The result starts with the reader's class name under
    ``"input_reader_type"`` and is then filled with every instance
    attribute reported by ``vars(self)``.

    Returns:
        dict mapping ``"input_reader_type"`` to the class name, plus one
        ``attribute_name -> attribute_value`` entry per instance attribute.
    """
    info = {"input_reader_type": type(self).__name__}
    # Instance attributes are merged last, so an attribute that happens to
    # be called "input_reader_type" replaces the class-name entry.
    info.update(vars(self))
    return info

def regular_file_exists(self, input_file, storage_options: Union[Dict[Any, Any], None] = None, **_kwargs):
"""Check that the `input_file` points to a single regular file
Expand Down Expand Up @@ -172,20 +172,6 @@ def read(self, input_file, read_columns=None):
) as reader:
yield from reader

def provenance_info(self) -> dict:
    """Describe this CSV reader's configuration for provenance tracking.

    Returns:
        dict with the reader type, ``chunksize``, ``schema_file``,
        ``column_names``, ``parquet_kwargs`` and a ``"kwargs"`` entry.
        The ``"kwargs"`` entry holds ``self.kwargs`` with every value
        converted to ``str``, but only when ``self.type_map`` is truthy;
        otherwise it is an empty dict.
    """
    # Reader kwargs are only recorded (as strings) when a type map is set.
    stringified = (
        {name: str(setting) for name, setting in self.kwargs.items()}
        if self.type_map
        else {}
    )
    return {
        "input_reader_type": "CsvReader",
        "chunksize": self.chunksize,
        "schema_file": self.schema_file,
        "column_names": self.column_names,
        "parquet_kwargs": self.parquet_kwargs,
        "kwargs": stringified,
    }


class AstropyEcsvReader(InputReader):
"""Reads astropy ascii .ecsv files.
Expand All @@ -209,9 +195,6 @@ def read(self, input_file, read_columns=None):
astropy_table = ascii_reader.read(input_file, format="ecsv", **self.kwargs)
yield astropy_table.to_pandas()

def provenance_info(self) -> dict:
    """Describe this reader for provenance tracking.

    Only the reader type is recorded; no instance attributes are included.

    Returns:
        dict with the single entry
        ``"input_reader_type" -> "AstropyEcsvReader"``.
    """
    return {"input_reader_type": "AstropyEcsvReader"}


class FitsReader(InputReader):
"""Chunked FITS file reader.
Expand Down Expand Up @@ -274,15 +257,6 @@ def read(self, input_file, read_columns=None):

read_rows += self.chunksize

def provenance_info(self) -> dict:
    """Describe this FITS reader's configuration for provenance tracking.

    Returns:
        dict with the reader type followed by the current values of
        ``chunksize``, ``column_names`` and ``skip_column_names``.
    """
    tracked_attributes = ("chunksize", "column_names", "skip_column_names")
    details = {"input_reader_type": "FitsReader"}
    # Copy each tracked setting over under its own attribute name.
    details.update((attr, getattr(self, attr)) for attr in tracked_attributes)
    return details


class ParquetReader(InputReader):
"""Parquet reader for the most common Parquet reading arguments.
Expand Down Expand Up @@ -310,10 +284,3 @@ def read(self, input_file, read_columns=None):
batch_size=self.chunksize, columns=columns, use_pandas_metadata=True
):
yield smaller_table.to_pandas()

def provenance_info(self) -> dict:
    """Describe this Parquet reader's configuration for provenance tracking.

    Returns:
        dict with the reader type and the current ``chunksize``.
    """
    return {"input_reader_type": "ParquetReader", "chunksize": self.chunksize}

0 comments on commit 8b19bc2

Please sign in to comment.