|
8 | 8 | import pyarrow as pa |
9 | 9 | import pyarrow.fs |
10 | 10 | import pyarrow.parquet as pq |
| 11 | +from pyarrow.lib import ArrowInvalid |
11 | 12 | from upath import UPath |
12 | 13 |
|
13 | 14 | from ..series.dtype import NestedDtype |
@@ -95,6 +96,12 @@ def read_parquet( |
95 | 96 | like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``` |
96 | 97 | from working, as this implies both full and partial load of "nested". |
97 | 98 |
|
| 99 | + Additionally, with partial loading, be aware that nested-pandas (and pyarrow)
| 100 | + only supports partial loading of struct-of-list columns. Your data may be
| 101 | + stored as a list of structs, which nested-pandas can still read, but without
| 102 | + support for partial loading. We raise a helpful error message in these
| 103 | + cases.
| 104 | +
|
98 | 105 | Furthermore, there are some cases where subcolumns will have the same name |
99 | 106 | as a top-level column. For example, if you have a column "nested" with |
100 | 107 | subcolumns "nested.a" and "nested.b", and also a top-level column "a". In |
@@ -214,25 +221,70 @@ def _read_parquet_into_table( |
214 | 221 | return _read_remote_parquet_directory( |
215 | 222 | path_to_data, filesystem, storage_options, columns, **kwargs |
216 | 223 | ) |
| 224 | + |
217 | 225 | with fsspec.parquet.open_parquet_file( |
218 | 226 | path_to_data.path, |
219 | 227 | columns=columns, |
220 | 228 | storage_options=storage_options, |
221 | 229 | fs=filesystem, |
222 | 230 | engine="pyarrow", |
223 | 231 | ) as parquet_file: |
224 | | - return pq.read_table(parquet_file, columns=columns, **kwargs) |
| 232 | + return _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) |
225 | 233 |
|
226 | 234 | # All other cases, including file-like objects, directories, and |
227 | 235 | # even lists of the foregoing. |
228 | 236 |
|
229 | 237 | # If `filesystem` is specified - use it, passing it as part of **kwargs |
230 | 238 | if kwargs.get("filesystem") is not None: |
231 | | - return pq.read_table(data, columns=columns, **kwargs) |
| 239 | + return _read_table_with_partial_load_check(data, columns=columns, **kwargs) |
232 | 240 |
|
233 | 241 | # Otherwise convert with a special function |
234 | 242 | data, filesystem = _transform_read_parquet_data_arg(data) |
235 | | - return pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) |
| 243 | + return _read_table_with_partial_load_check(data, columns=columns, filesystem=filesystem, **kwargs) |
| 244 | + |
| 245 | + |
def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **kwargs):
    """Read a parquet table, diagnosing unsupported partial loads of nested columns.

    Thin wrapper around ``pq.read_table``. When the read fails with an
    ``ArrowInvalid`` "No match for" error and dotted column names were
    requested (e.g. ``"nested.a"``), the file schema is inspected so a
    clearer ``ValueError`` can be raised for columns whose parent is not a
    struct (partial loading is only supported for struct-of-list columns).

    Parameters
    ----------
    data : path, file-like object, or list thereof
        Forwarded to ``pq.read_table``.
    columns : list of str or None
        Columns to load; dotted names request partial loads of nested columns.
    filesystem : filesystem object or None
        Filesystem to read with. NOTE: some callers pass this explicitly while
        others supply it inside ``**kwargs``; both are honored.
    **kwargs
        Additional arguments forwarded to ``pq.read_table``.

    Returns
    -------
    pyarrow.Table

    Raises
    ------
    ValueError
        If a dotted column targets a non-struct parent column.
    pyarrow.lib.ArrowInvalid
        Re-raised when the failure is unrelated to partial loading or no
        better diagnosis is found.
    """
    # Bug fix: `filesystem` must actually reach pq.read_table. Merge it into
    # kwargs with setdefault so the call site that already passes filesystem
    # inside **kwargs does not hit a duplicate-keyword TypeError.
    if filesystem is not None:
        kwargs.setdefault("filesystem", filesystem)
    try:
        return pq.read_table(data, columns=columns, **kwargs)
    except ArrowInvalid as e:
        # Errors unrelated to missing (sub)columns are re-raised untouched.
        if "No match for" not in str(e):
            raise
        # Dotted names signal an attempted partial load of a nested column;
        # only then is the extra schema inspection worth its cost.
        if columns is not None and any("." in col for col in columns):
            try:
                _validate_structs_from_schema(
                    data, columns=columns, filesystem=kwargs.get("filesystem")
                )
            except ValueError as validation_error:
                # Chain so the original Arrow error remains visible in the traceback.
                raise validation_error from e
        raise
| 262 | + |
| 263 | + |
def _validate_structs_from_schema(data, columns=None, filesystem=None):
    """Verify that every dotted column name targets a struct-typed parent.

    Reads only the parquet schema (not the data) and raises a ``ValueError``
    with guidance when a requested subcolumn such as ``"nested.a"`` refers to
    a top-level column that is not a struct, since partial loading is only
    supported for struct-of-list columns. Columns without a dot, dotted names
    that exist verbatim as top-level columns, and dotted names whose base is
    absent from the schema are all left alone.
    """
    if columns is None:
        return
    schema = pq.read_schema(data, filesystem=filesystem)
    top_level = schema.names
    for col in columns:
        # Only dotted names can describe a partial load.
        if "." not in col:
            continue
        # A top-level column whose literal name contains a dot is valid as-is.
        if col in top_level:
            continue
        base_col = col.split(".")[0]
        if base_col not in top_level:
            continue
        # The parent must be struct-typed for subcolumn selection to work.
        if not pa.types.is_struct(schema.field(base_col).type):
            raise ValueError(
                f"The provided column '{col}' signals to partially load a nested structure, "
                f"but the nested structure '{base_col}' is not a struct. "
                "Partial loading of nested structures is only supported for struct of list "
                f"columns. To resolve this, fully load the column '{base_col}' "
                f"instead of partially loading it and perform column selection afterwards."
            )
236 | 288 |
|
237 | 289 |
|
238 | 290 | def _is_local_dir(upath: UPath) -> bool: |
@@ -273,7 +325,7 @@ def _read_remote_parquet_directory( |
273 | 325 | fs=filesystem, |
274 | 326 | engine="pyarrow", |
275 | 327 | ) as parquet_file: |
276 | | - table = pq.read_table(parquet_file, columns=columns, **kwargs) |
| 328 | + table = _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) |
277 | 329 | tables.append(table) |
278 | 330 | return pa.concat_tables(tables) |
279 | 331 |
|
|
0 commit comments