@@ -3,6 +3,7 @@

from pathlib import Path

+import fsspec.parquet
import pandas as pd
import pyarrow as pa
import pyarrow.fs
@@ -14,7 +15,9 @@
from ..series.utils import table_to_struct_array
from .core import NestedFrame

-# Use smaller block size for FSSPEC filesystems, it usually helps with parquet reads
+# Use smaller block size for these FSSPEC filesystems.
+# It usually helps with parquet read speed.
+FSSPEC_FILESYSTEMS = ("http", "https")
FSSPEC_BLOCK_SIZE = 32 * 1024


@@ -25,19 +28,29 @@ def read_parquet(
    autocast_list: bool = False,
    **kwargs,
) -> NestedFrame:
-    """
-    Load a parquet object from a file path into a NestedFrame.
+    """Load a parquet object from a file path into a NestedFrame.

-    As a deviation from `pandas`, this function loads via
-    `pyarrow.parquet.read_table`, and then converts to a NestedFrame.
+    As a specialization of the ``pandas.read_parquet`` function, this
+    function loads the data via existing ``pyarrow`` or
+    ``fsspec.parquet`` methods, and then converts the data to a
+    NestedFrame.

    Parameters
    ----------
    data: str, list of str, Path, UPath, or file-like object
-        Path to the data or a file-like object. If a string is passed, it can be a single file name,
-        directory name, or a remote path (e.g., HTTP/HTTPS or S3). If a file-like object is passed,
-        it must support the `read` method. You can also pass the `filesystem` argument with
-        a `pyarrow.fs` object, which will be passed to `pyarrow.parquet.read_table()`.
+        Path to the data or a file-like object. If a string is passed,
+        it can be a single file name, directory name, or a remote path
+        (e.g., HTTP/HTTPS or S3). If a file-like object is passed, it
+        must support the ``read`` method. You can also pass a
+        ``filesystem`` keyword argument with a ``pyarrow.fs`` object, which will
+        be passed along to the underlying file-reading method.
+        A file URL can also be a path to a directory that contains multiple
+        partitioned parquet files. Both pyarrow and fastparquet support
+        paths to directories as well as file URLs. A directory path could be:
+        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
+        If the path is to a single Parquet file, it will be loaded using
+        ``fsspec.parquet.open_parquet_file``, which has optimized handling
+        for remote Parquet files.
    columns : list, default=None
        If not None, only these columns will be read from the file.
    reject_nesting: list or str, default=None
@@ -57,6 +70,11 @@ def read_parquet(

    Notes
    -----
+    For paths to single Parquet files, this function uses
+    ``fsspec.parquet.open_parquet_file``, which performs intelligent
+    precaching. This can significantly improve performance compared
+    to standard PyArrow reading on remote files.
+
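+    A rough, illustrative sketch of what happens under the hood for a
+    single remote file (the URL below is hypothetical)::
+
+        import fsspec.parquet
+        import pyarrow.parquet as pq
+
+        with fsspec.parquet.open_parquet_file(
+            "https://example.com/file.parquet", columns=["a"], engine="pyarrow"
+        ) as parquet_file:
+            table = pq.read_table(parquet_file, columns=["a"])
+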
    pyarrow supports partial loading of nested structures from parquet, for
    example ```pd.read_parquet("data.parquet", columns=["nested.a"])``` will
    load the "a" column of the "nested" column. Standard pandas/pyarrow
@@ -85,6 +103,7 @@ def read_parquet(

    >>> # Load only the "flux" sub-column of the "nested" column
    >>> nf = npd.read_parquet("path/to/file.parquet", columns=["a", "nested.flux"]) # doctest: +SKIP
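+
+    >>> # Illustrative only (the URL is hypothetical): single remote Parquet
+    >>> # files are opened via fsspec.parquet.open_parquet_file under the hood
+    >>> nf = npd.read_parquet("https://example.com/file.parquet", columns=["a"]) # doctest: +SKIP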
+
    """

    # Type convergence for reject_nesting
@@ -93,14 +112,41 @@ def read_parquet(
    elif isinstance(reject_nesting, str):
        reject_nesting = [reject_nesting]

-    # First load through pyarrow
-    # If `filesystem` is specified - use it
-    if kwargs.get("filesystem") is not None:
-        table = pq.read_table(data, columns=columns, **kwargs)
-    # Otherwise convert with a special function
+    # For single Parquet file paths, we want to use
+    # `fsspec.parquet.open_parquet_file`. But for any other usage
+    # (which includes file-like objects, directories and lists
+    # thereof), we want to defer to `pq.read_table`.
+
+    # At the end of this block, `table` will contain the data.
+
+    # NOTE: the test for _is_local_dir is sufficient, because we're
+    # preserving a path to pq.read_table, which can read local
+    # directories, but not remote directories. Remote directories
+    # cannot be read by either of these methods.
+    if isinstance(data, str | Path | UPath) and not _is_local_dir(path_to_data := UPath(data)):
+        storage_options = _get_storage_options(path_to_data)
+        filesystem = kwargs.get("filesystem")
+        if not filesystem:
+            _, filesystem = _transform_read_parquet_data_arg(path_to_data)
+        with fsspec.parquet.open_parquet_file(
+            str(path_to_data),
+            columns=columns,
+            storage_options=storage_options,
+            fs=filesystem,
+            engine="pyarrow",
+        ) as parquet_file:
+            table = pq.read_table(parquet_file, columns=columns, **kwargs)
    else:
-        data, filesystem = _transform_read_parquet_data_arg(data)
-        table = pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs)
+        # All other cases, including file-like objects, directories, and
+        # even lists of the foregoing.
+
+        # If `filesystem` is specified - use it, passing it as part of **kwargs
+        if kwargs.get("filesystem") is not None:
+            table = pq.read_table(data, columns=columns, **kwargs)
+        else:
+            # Otherwise convert with a special function
+            data, filesystem = _transform_read_parquet_data_arg(data)
+            table = pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs)

    # Resolve partial loading of nested structures
    # Using pyarrow to avoid naming conflicts from partial loading ("flux" vs "lc.flux")
@@ -160,6 +206,41 @@ def read_parquet(
    return from_pyarrow(table, reject_nesting=reject_nesting, autocast_list=autocast_list)


+def _is_local_dir(path_to_data: UPath):
+    """Returns True if the given path refers to a local directory.
+
+    It's necessary to have this function, rather than simply checking
+    ``UPath(p).is_dir()``, because ``UPath.is_dir`` can be quite
+    expensive in the case of a remote file path that isn't a directory.
+    """
+    return path_to_data.protocol in ("", "file") and path_to_data.is_dir()
+
+
+def _get_storage_options(path_to_data: UPath):
+    """Get storage options for fsspec.parquet.open_parquet_file.
+
+    Parameters
+    ----------
+    path_to_data : UPath
+        The data source
+
+    Returns
+    -------
+    dict
+        Storage options (or None)
+    """
+    if path_to_data.protocol not in ("", "file"):
+        # Remote files of all types (s3, http)
+        storage_options = path_to_data.storage_options or {}
+        # For some cases, use smaller block size
+        if path_to_data.protocol in FSSPEC_FILESYSTEMS:
+            storage_options = {**storage_options, "block_size": FSSPEC_BLOCK_SIZE}
+        return storage_options
+
+    # Local files
+    return None
+
+
def _transform_read_parquet_data_arg(data):
164245 """Transform `data` argument of read_parquet to pq.read_parquet's `source` and `filesystem`"""
    # Check if a list, run the function recursively and check that filesystems are all the same
@@ -204,8 +285,8 @@ def _transform_read_parquet_data_arg(data):
    # If it is a local path, use pyarrow's filesystem
    if upath.protocol == "":
        return upath.path, None
-    # If HTTP, change the default UPath object to use a smaller block size
-    if upath.protocol in ("http", "https"):
+    # Change the default UPath object to use a smaller block size in some cases
+    if upath.protocol in FSSPEC_FILESYSTEMS:
        upath = UPath(upath, block_size=FSSPEC_BLOCK_SIZE)
    return upath.path, upath.fs
