make ParquetReader.read use read_columns kwarg; add column_names kwarg to init.
troyraen committed May 17, 2024
1 parent 06b8379 commit ade16d2
Showing 1 changed file with 8 additions and 2 deletions.
src/hipscat_import/catalog/file_readers.py (10 changes: 8 additions & 2 deletions)
@@ -291,18 +291,24 @@ class ParquetReader(InputReader):
         chunksize (int): number of rows of the file to process at once.
             For large files, this can prevent loading the entire file
             into memory at once.
+        column_names (list[str] or None): Names of columns to use from the input dataset.
+            If None, use all columns.
         kwargs: arguments to pass along to pyarrow.parquet.ParquetFile.
             See https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetFile.html
     """
 
-    def __init__(self, chunksize=500_000, **kwargs):
+    def __init__(self, chunksize=500_000, column_names=None, **kwargs):
         self.chunksize = chunksize
+        self.column_names = column_names
         self.kwargs = kwargs
 
     def read(self, input_file, read_columns=None):
         self.regular_file_exists(input_file, **self.kwargs)
+        columns = read_columns or self.column_names
         parquet_file = pq.ParquetFile(input_file, **self.kwargs)
-        for smaller_table in parquet_file.iter_batches(batch_size=self.chunksize, use_pandas_metadata=True):
+        for smaller_table in parquet_file.iter_batches(
+            batch_size=self.chunksize, columns=columns, use_pandas_metadata=True
+        ):
             yield smaller_table.to_pandas()
 
     def provenance_info(self) -> dict:
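For reference, a minimal usage sketch of the new option, assuming the import path shown in the diff; the file path "catalog.parquet" and the column names "ra", "dec", and "id" are hypothetical placeholders:

    from hipscat_import.catalog.file_readers import ParquetReader

    # Restrict every chunk to two columns ("ra" and "dec" are
    # hypothetical names used only for illustration).
    reader = ParquetReader(chunksize=100_000, column_names=["ra", "dec"])
    for chunk in reader.read("catalog.parquet"):
        ...  # each chunk is a pandas DataFrame holding only ra and dec

    # A read_columns argument passed to read() overrides column_names,
    # since the reader computes: columns = read_columns or self.column_names
    for chunk in reader.read("catalog.parquet", read_columns=["id"]):
        ...  # only the id column is loaded

Because of that fallback ordering, a reader configured once with default columns can still be narrowed per call via read_columns.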
